Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -226,10 +226,6 @@
       SCALEF, SCALEFS,
 
-      // Integer add/sub with signed saturation.
-      ADDS,
-      SUBS,
-
       // Unsigned Integer average.
       AVG,
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -830,14 +830,20 @@
     }
 
     setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
-    setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
     setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
     setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
 
     // Use widening instead of promotion.
     for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
                      MVT::v4i16, MVT::v2i16 }) {
-      setOperationAction(ISD::UADDSAT, VT, Custom);
-      setOperationAction(ISD::USUBSAT, VT, Custom);
+      setOperationAction(ISD::UADDSAT, VT, Custom);
+      setOperationAction(ISD::SADDSAT, VT, Custom);
+      setOperationAction(ISD::USUBSAT, VT, Custom);
+      setOperationAction(ISD::SSUBSAT, VT, Custom);
     }
 
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
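[Editorial note, not part of the patch] The "widening instead of promotion" strategy marked Custom above can be pictured outside SelectionDAG terms: a narrow vector such as v8i8 is padded out to the legal v16i8 type, the legal 128-bit saturating operation is applied, and only the low lanes are kept. Widening is preferred over promotion here because saturation depends on the element width; promoting i8 lanes to i16 would clamp at the wrong bounds. A minimal sketch of the idea, assuming an SSE2 target and helper names of our own choosing:

```cpp
// Sketch (ours, not LLVM code) of widening a v8i8 saturating add:
// do the add on a full 16-byte vector and keep the low 8 lanes.
#include <immintrin.h>
#include <cstdint>
#include <cstring>

void sadd_sat_v8i8(const int8_t *x, const int8_t *y, int8_t *out) {
  // Load 8 bytes into the low half of an XMM register; high half is zero.
  __m128i vx = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(x));
  __m128i vy = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(y));
  // The legal v16i8 PADDSB; the high lanes are don't-care here.
  __m128i vr = _mm_adds_epi8(vx, vy);
  std::memcpy(out, &vr, 8);  // keep only the low 8 lanes
}
```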
@@ -1212,9 +1218,13 @@
     setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
 
     setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
     setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
     setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
 
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
       setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
@@ -1334,7 +1344,9 @@
       setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::TRUNCATE, VT, Custom);
       setOperationAction(ISD::UADDSAT, VT, Custom);
+      setOperationAction(ISD::SADDSAT, VT, Custom);
       setOperationAction(ISD::USUBSAT, VT, Custom);
+      setOperationAction(ISD::SSUBSAT, VT, Custom);
 
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -1596,7 +1608,9 @@
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::VSELECT, VT, Expand);
       setOperationAction(ISD::UADDSAT, VT, Custom);
+      setOperationAction(ISD::SADDSAT, VT, Custom);
       setOperationAction(ISD::USUBSAT, VT, Custom);
+      setOperationAction(ISD::SSUBSAT, VT, Custom);
 
       setOperationAction(ISD::TRUNCATE, VT, Custom);
       setOperationAction(ISD::SETCC, VT, Custom);
@@ -1678,7 +1692,9 @@
       setOperationAction(ISD::UMIN, VT, Legal);
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::UADDSAT, VT, Legal);
+      setOperationAction(ISD::SADDSAT, VT, Legal);
       setOperationAction(ISD::USUBSAT, VT, Legal);
+      setOperationAction(ISD::SSUBSAT, VT, Legal);
 
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -23388,15 +23404,17 @@
   return split256IntArith(Op, DAG);
 }
 
-static SDValue LowerUADDSAT_USUBSAT(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   if (VT.getScalarType() == MVT::i1) {
     SDLoc dl(Op);
     switch (Op.getOpcode()) {
     default: llvm_unreachable("Expected saturated arithmetic opcode");
     case ISD::UADDSAT:
+    case ISD::SADDSAT:
       return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1));
     case ISD::USUBSAT:
+    case ISD::SSUBSAT:
       return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
                          DAG.getNOT(dl, Op.getOperand(1), VT));
     }
@@ -26194,7 +26212,9 @@
   case ISD::ADD:
   case ISD::SUB:                return LowerADD_SUB(Op, DAG);
   case ISD::UADDSAT:
-  case ISD::USUBSAT:            return LowerUADDSAT_USUBSAT(Op, DAG);
+  case ISD::SADDSAT:
+  case ISD::USUBSAT:
+  case ISD::SSUBSAT:            return LowerADDSAT_SUBSAT(Op, DAG);
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::UMAX:
@@ -26277,11 +26297,13 @@
     return;
   }
   case ISD::UADDSAT:
+  case ISD::SADDSAT:
   case ISD::USUBSAT:
+  case ISD::SSUBSAT:
   case X86ISD::VPMADDWD:
   case X86ISD::AVG: {
-    // Legalize types for ISD::UADDSAT/USUBSAT and X86ISD::AVG/VPMADDWD
-    // by widening.
+    // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
+    // X86ISD::AVG/VPMADDWD by widening.
     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
 
     EVT VT = N->getValueType(0);
@@ -27228,8 +27250,6 @@
   case X86ISD::FGETEXPS_RND:       return "X86ISD::FGETEXPS_RND";
   case X86ISD::SCALEF:             return "X86ISD::SCALEF";
   case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
-  case X86ISD::ADDS:               return "X86ISD::ADDS";
-  case X86ISD::SUBS:               return "X86ISD::SUBS";
  case X86ISD::AVG:                return "X86ISD::AVG";
  case X86ISD::MULHRS:             return "X86ISD::MULHRS";
  case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
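[Editorial note, not part of the patch] The vXi1 special case in LowerADDSAT_SUBSAT above relies on saturating add degenerating to OR and saturating sub to AND-NOT on 1-bit elements, and that identity holds for the unsigned and (new here) the signed interpretation alike: a signed i1 holds 0 or -1, so -1 + -1 saturates back to -1 just as 1 | 1 = 1. A self-contained, exhaustive check of the identity, with helper names of our own invention:

```cpp
// Verify, over all four input pairs, that on 1-bit values signed
// saturating add equals OR and signed saturating sub equals AND-NOT.
#include <cassert>

// Signed i1 range is {-1, 0}; clamp the wide result back into it.
int sadd_sat_i1(int a, int b) {
  int s = a + b;
  if (s < -1) s = -1;  // saturate at the minimum
  if (s > 0) s = 0;    // saturate at the maximum
  return s;
}

int ssub_sat_i1(int a, int b) {
  int d = a - b;
  if (d < -1) d = -1;
  if (d > 0) d = 0;
  return d;
}

int main() {
  for (int a : {0, -1}) {
    for (int b : {0, -1}) {
      int ab = a & 1, bb = b & 1;       // the raw bits
      int or_bit = ab | bb;             // lowering of [SU]ADDSAT on i1
      int andnot_bit = ab & ~bb & 1;    // lowering of [SU]SUBSAT on i1
      // Sign-extend the 1-bit results back to int for comparison.
      assert(sadd_sat_i1(a, b) == -or_bit);
      assert(ssub_sat_i1(a, b) == -andnot_bit);
    }
  }
}
```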
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -4830,9 +4830,9 @@
                                     SchedWriteVecALU, 1>;
 defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
                                     SchedWriteVecALU, 0>;
-defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds,
+defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", saddsat,
                                     SchedWriteVecALU, HasBWI, 1>;
-defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
+defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", ssubsat,
                                     SchedWriteVecALU, HasBWI, 0>;
 defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", uaddsat,
                                     SchedWriteVecALU, HasBWI, 1>;
Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -227,8 +227,6 @@
                                           SDTCisVec<1>,
                                           SDTCisSameAs<2, 1>]>;
 
-def X86adds   : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>;
-def X86subs   : SDNode<"X86ISD::SUBS", SDTIntBinOp>;
 def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>;
 def X86avg    : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>;
 def X86ptest  : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -3623,9 +3623,9 @@
                              SchedWriteVecALU, 1, NoVLX>;
 defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                              SchedWriteVecALU, 1, NoVLX>;
-defm PADDSB  : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
+defm PADDSB  : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PADDSW  : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
+defm PADDSW  : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
@@ -3645,9 +3645,9 @@
                              SchedWriteVecALU, 0, NoVLX>;
 defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                              SchedWriteVecALU, 0, NoVLX>;
-defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
+defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
-defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
+defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
@@ -319,8 +319,8 @@
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, ISD::SADDSAT, 0),
+  X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, ISD::SADDSAT, 0),
   X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
@@ -361,8 +361,8 @@
   X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
+  X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
   X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
@@ -920,8 +920,8 @@
   X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_padds_b_512, INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(avx512_padds_w_512, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_padds_b_512, INTR_TYPE_2OP, ISD::SADDSAT, 0),
+  X86_INTRINSIC_DATA(avx512_padds_w_512, INTR_TYPE_2OP, ISD::SADDSAT, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -1004,8 +1004,8 @@
   X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx512_psubs_b_512, INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx512_psubs_w_512, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_psubs_b_512, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
+  X86_INTRINSIC_DATA(avx512_psubs_w_512, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
@@ -1168,8 +1168,8 @@
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, ISD::SADDSAT, 0),
+  X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, ISD::SADDSAT, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1191,8 +1191,8 @@
   X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
-  X86_INTRINSIC_DATA(sse2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(sse2_psubs_b, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
+  X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
   X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
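[Editorial note, not part of the patch] With the table entries above, the legacy saturating-add/sub target intrinsics select through the generic ISD::SADDSAT/SSUBSAT nodes rather than the X86-specific ones; user code is unaffected, and the same instruction is emitted as before. A small usage sketch, assuming an SSE2 target:

```cpp
// The user-facing intrinsic for PADDSB; after this change it reaches
// isel as the generic signed-saturating-add node.
#include <immintrin.h>
#include <cstdio>

int main() {
  __m128i a = _mm_set1_epi8(100);
  __m128i b = _mm_set1_epi8(100);
  __m128i r = _mm_adds_epi8(a, b);  // 100 + 100 saturates to 127 per lane
  signed char out[16];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(out), r);
  std::printf("%d\n", out[0]);  // prints 127
  return 0;
}
```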
Index: llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll
+++ llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll
@@ -34,13453 +34,160 @@
 ; Legal types, depending on architecture.
 
 define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
-; SSE2-LABEL: v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil
-; SSE2-NEXT: movl %r9d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r9b
-; SSE2-NEXT: jno .LBB0_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r9d
-; SSE2-NEXT: .LBB0_2:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movl %esi, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %sil
-; SSE2-NEXT: jno .LBB0_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %esi
-; SSE2-NEXT: .LBB0_4:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movl %ebx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %bl
-; SSE2-NEXT: jno .LBB0_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebx
-; SSE2-NEXT: .LBB0_6:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %dl
-; SSE2-NEXT: jno .LBB0_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %edx
-; SSE2-NEXT: .LBB0_8:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b
-; SSE2-NEXT: movl %r10d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r10b
-; SSE2-NEXT: jno .LBB0_10
-; SSE2-NEXT: # %bb.9:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r10d
-; SSE2-NEXT: .LBB0_10:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b
-; SSE2-NEXT: movl %r11d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r11b
-; SSE2-NEXT: jno .LBB0_12
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r11d
-; SSE2-NEXT: .LBB0_12:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl
-; SSE2-NEXT: movl %ebp, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %bpl
-; SSE2-NEXT: jno .LBB0_14
-; SSE2-NEXT: # %bb.13:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebp
-; SSE2-NEXT: .LBB0_14:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b
-; SSE2-NEXT: movl %r14d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r14b
-; SSE2-NEXT: jno .LBB0_16
-; SSE2-NEXT: # %bb.15:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r14d
-; SSE2-NEXT: .LBB0_16:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b
-; SSE2-NEXT: movl %r15d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r15b
-; SSE2-NEXT: jno .LBB0_18
-; SSE2-NEXT: # %bb.17:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r15d
-; SSE2-NEXT: .LBB0_18:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b
-; SSE2-NEXT: movl %r12d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r12b
-; SSE2-NEXT: jno .LBB0_20
-; SSE2-NEXT: # %bb.19:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r12d
-; SSE2-NEXT: .LBB0_20:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b
-; SSE2-NEXT: movl %r13d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r13b
-; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB0_22
-; SSE2-NEXT: # %bb.21:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r13d
-; SSE2-NEXT: .LBB0_22:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil
-; SSE2-NEXT: movl %edi, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %dil
-; SSE2-NEXT: jno .LBB0_24
-; SSE2-NEXT: # %bb.23:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %edi
-; SSE2-NEXT: .LBB0_24:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b
-; SSE2-NEXT: movl %r8d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r8b
-; SSE2-NEXT: jno .LBB0_26
-; SSE2-NEXT: # %bb.25:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r8d
-; SSE2-NEXT: .LBB0_26:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movl %ebx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %bl
-; SSE2-NEXT: jno .LBB0_28
-; SSE2-NEXT: # %bb.27:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebx
-; SSE2-NEXT: .LBB0_28:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: addb %dl, %cl
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addb %dl, %al
-; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB0_30
-; SSE2-NEXT: # %bb.29:
-; SSE2-NEXT: addb $127, %cl
-; SSE2-NEXT: movl %ecx, %eax
-; SSE2-NEXT: .LBB0_30:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil
-; SSE2-NEXT: movl %esi, %ecx
-; SSE2-NEXT: addb %dl, %cl
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addb %dl, %sil
-; SSE2-NEXT: jno .LBB0_32
-; SSE2-NEXT: # %bb.31:
-; SSE2-NEXT: addb $127, %cl
-; SSE2-NEXT: movl %ecx, %esi
-; SSE2-NEXT: .LBB0_32:
-; SSE2-NEXT: movzbl %sil, %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movzbl %al, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movzbl %bl, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl %r8b, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl %r13b, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: movzbl %r12b, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl %r15b, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movzbl %r14b, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl %bpl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movzbl %r11b, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl %r10b, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: movzbl %r9b, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: popq %r12
-; SSE2-NEXT: popq %r13
-; SSE2-NEXT: popq %r14
-; SSE2-NEXT: popq %r15
-; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v16i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pushq %rbp
-; SSSE3-NEXT: pushq %r15
-; SSSE3-NEXT: pushq %r14
-; SSSE3-NEXT: pushq %r13
-; SSSE3-NEXT: pushq %r12
-; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil
-; SSSE3-NEXT: movl %r9d, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %r9b
-; SSSE3-NEXT: jno .LBB0_2
-; SSSE3-NEXT: # %bb.1:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %r9d
-; SSSE3-NEXT: .LBB0_2:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movl %esi, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %sil
-; SSSE3-NEXT: jno .LBB0_4
-; SSSE3-NEXT: # %bb.3:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %esi
-; SSSE3-NEXT: .LBB0_4:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSSE3-NEXT: movl %ebx, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %bl
-; SSSE3-NEXT: jno .LBB0_6
-; SSSE3-NEXT: # %bb.5:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %ebx
-; SSSE3-NEXT: .LBB0_6:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSSE3-NEXT: movl %edx, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %dl
-; SSSE3-NEXT: jno .LBB0_8
-; SSSE3-NEXT: # %bb.7:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %edx
-; SSSE3-NEXT: .LBB0_8:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b
-; SSSE3-NEXT: movl %r10d, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %r10b
-; SSSE3-NEXT: jno .LBB0_10
-; SSSE3-NEXT: # %bb.9:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %r10d
-; SSSE3-NEXT: .LBB0_10:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b
-; SSSE3-NEXT: movl %r11d, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %r11b
-; SSSE3-NEXT: jno .LBB0_12
-; SSSE3-NEXT: # %bb.11:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %r11d
-; SSSE3-NEXT: .LBB0_12:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl
-; SSSE3-NEXT: movl %ebp, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %bpl
-; SSSE3-NEXT: jno .LBB0_14
-; SSSE3-NEXT: # %bb.13:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %ebp
-; SSSE3-NEXT: .LBB0_14:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b
-; SSSE3-NEXT: movl %r14d, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %r14b
-; SSSE3-NEXT: jno .LBB0_16
-; SSSE3-NEXT: # %bb.15:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %r14d
-; SSSE3-NEXT: .LBB0_16:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b
-; SSSE3-NEXT: movl %r15d, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %r15b
-; SSSE3-NEXT: jno .LBB0_18
-; SSSE3-NEXT: # %bb.17:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %r15d
-; SSSE3-NEXT: .LBB0_18:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b
-; SSSE3-NEXT: movl %r12d, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %r12b
-; SSSE3-NEXT: jno .LBB0_20
-; SSSE3-NEXT: # %bb.19:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %r12d
-; SSSE3-NEXT: .LBB0_20:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b
-; SSSE3-NEXT: movl %r13d, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %r13b
-; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: jno .LBB0_22
-; SSSE3-NEXT: # %bb.21:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %r13d
-; SSSE3-NEXT: .LBB0_22:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil
-; SSSE3-NEXT: movl %edi, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %dil
-; SSSE3-NEXT: jno .LBB0_24
-; SSSE3-NEXT: # %bb.23:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %edi
-; SSSE3-NEXT: .LBB0_24:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b
-; SSSE3-NEXT: movl %r8d, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %r8b
-; SSSE3-NEXT: jno .LBB0_26
-; SSSE3-NEXT: # %bb.25:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %r8d
-; SSSE3-NEXT: .LBB0_26:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSSE3-NEXT: movl %ebx, %eax
-; SSSE3-NEXT: addb %cl, %al
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: addb %cl, %bl
-; SSSE3-NEXT: jno .LBB0_28
-; SSSE3-NEXT: # %bb.27:
-; SSSE3-NEXT: addb $127, %al
-; SSSE3-NEXT: movl %eax, %ebx
-; SSSE3-NEXT: .LBB0_28:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: addb %dl, %cl
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addb %dl, %al
-; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSSE3-NEXT: jno .LBB0_30
-; SSSE3-NEXT: # %bb.29:
-; SSSE3-NEXT: addb $127, %cl
-; SSSE3-NEXT: movl %ecx, %eax
-; SSSE3-NEXT: .LBB0_30:
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil
-; SSSE3-NEXT: movl %esi, %ecx
-; SSSE3-NEXT: addb %dl, %cl
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addb %dl, %sil
-; SSSE3-NEXT: jno .LBB0_32
-; SSSE3-NEXT: # %bb.31:
-; SSSE3-NEXT: addb $127, %cl
-; SSSE3-NEXT: movl %ecx, %esi
-; SSSE3-NEXT: .LBB0_32:
-; SSSE3-NEXT: movzbl %sil, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movzbl %al, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: movzbl %bl, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl %r8b, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: movzbl %dil, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl %r13b, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: movzbl %r12b, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl %r15b, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSSE3-NEXT: movzbl %r14b, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl %bpl, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: movzbl %r11b, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl %r10b, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: movzbl %r9b, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: popq %rbx
-; SSSE3-NEXT: popq %r12
-; SSSE3-NEXT: popq %r13
-; SSSE3-NEXT: popq %r14
-; SSSE3-NEXT: popq %r15
-; SSSE3-NEXT: popq %rbp
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrb $15, %xmm1, %ecx
-; SSE41-NEXT: pextrb $15, %xmm0, %edx
-; SSE41-NEXT: movl %edx, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %dl
-; SSE41-NEXT: jno .LBB0_2
-; SSE41-NEXT: # %bb.1:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %edx
-; SSE41-NEXT: .LBB0_2:
-; SSE41-NEXT: pextrb $14, %xmm1, %ecx
-; SSE41-NEXT: pextrb $14, %xmm0, %r11d
-; SSE41-NEXT: movl %r11d, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %r11b
-; SSE41-NEXT: jno .LBB0_4
-; SSE41-NEXT: # %bb.3:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %r11d
-; SSE41-NEXT: .LBB0_4:
-; SSE41-NEXT: pextrb $13, %xmm1, %ecx
-; SSE41-NEXT: pextrb $13, %xmm0, %edi
-; SSE41-NEXT: movl %edi, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %dil
-; SSE41-NEXT: jno .LBB0_6
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %edi
-; SSE41-NEXT: .LBB0_6:
-; SSE41-NEXT: pushq %rbp
-; SSE41-NEXT: pushq %r15
-; SSE41-NEXT: pushq %r14
-; SSE41-NEXT: pushq %r13
-; SSE41-NEXT: pushq %r12
-; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: pextrb $12, %xmm1, %ecx
-; SSE41-NEXT: pextrb $12, %xmm0, %r14d
-; SSE41-NEXT: movl %r14d, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %r14b
-; SSE41-NEXT: jno .LBB0_8
-; SSE41-NEXT: # %bb.7:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %r14d
-; SSE41-NEXT: .LBB0_8:
-; SSE41-NEXT: pextrb $11, %xmm1, %ecx
-; SSE41-NEXT: pextrb $11, %xmm0, %ebp
-; SSE41-NEXT: movl %ebp, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %bpl
-; SSE41-NEXT: jno .LBB0_10
-; SSE41-NEXT: # %bb.9:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %ebp
-; SSE41-NEXT: .LBB0_10:
-; SSE41-NEXT: pextrb $10, %xmm1, %ecx
-; SSE41-NEXT: pextrb $10, %xmm0, %r15d
-; SSE41-NEXT: movl %r15d, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %r15b
-; SSE41-NEXT: jno .LBB0_12
-; SSE41-NEXT: # %bb.11:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %r15d
-; SSE41-NEXT: .LBB0_12:
-; SSE41-NEXT: pextrb $9, %xmm1, %ecx
-; SSE41-NEXT: pextrb $9, %xmm0, %r12d
-; SSE41-NEXT: movl %r12d, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %r12b
-; SSE41-NEXT: jno .LBB0_14
-; SSE41-NEXT: # %bb.13:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %r12d
-; SSE41-NEXT: .LBB0_14:
-; SSE41-NEXT: pextrb $8, %xmm1, %ecx
-; SSE41-NEXT: pextrb $8, %xmm0, %r13d
-; SSE41-NEXT: movl %r13d, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %r13b
-; SSE41-NEXT: jno .LBB0_16
-; SSE41-NEXT: # %bb.15:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %r13d
-; SSE41-NEXT: .LBB0_16:
-; SSE41-NEXT: pextrb $7, %xmm1, %ecx
-; SSE41-NEXT: pextrb $7, %xmm0, %r10d
-; SSE41-NEXT: movl %r10d, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %r10b
-; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: jno .LBB0_18
-; SSE41-NEXT: # %bb.17:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %r10d
-; SSE41-NEXT: .LBB0_18:
-; SSE41-NEXT: pextrb $6, %xmm1, %ecx
-; SSE41-NEXT: pextrb $6, %xmm0, %r9d
-; SSE41-NEXT: movl %r9d, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %r9b
-; SSE41-NEXT: jno .LBB0_20
-; SSE41-NEXT: # %bb.19:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %r9d
-; SSE41-NEXT: .LBB0_20:
-; SSE41-NEXT: pextrb $5, %xmm1, %ecx
-; SSE41-NEXT: pextrb $5, %xmm0, %ebp
-; SSE41-NEXT: movl %ebp, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %bpl
-; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: jno .LBB0_22
-; SSE41-NEXT: # %bb.21:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %ebp
-; SSE41-NEXT: .LBB0_22:
-; SSE41-NEXT: pextrb $4, %xmm1, %ecx
-; SSE41-NEXT: pextrb $4, %xmm0, %edi
-; SSE41-NEXT: movl %edi, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: addb %cl, %dil
-; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: jno .LBB0_24
-; SSE41-NEXT: # %bb.23:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %edi
-; SSE41-NEXT: .LBB0_24:
-; SSE41-NEXT: pextrb $3, %xmm1, %edx
-; SSE41-NEXT: pextrb $3, %xmm0, %eax
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: addb %dl, %cl
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addb %dl, %al
-; SSE41-NEXT: jno .LBB0_26
-; SSE41-NEXT: # %bb.25:
-; SSE41-NEXT: addb $127, %cl
-; SSE41-NEXT: movl %ecx, %eax
-; SSE41-NEXT: .LBB0_26:
-; SSE41-NEXT: pextrb $2, %xmm1, %ebx
-; SSE41-NEXT: pextrb $2, %xmm0, %ecx
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: addb %bl, %dl
-; SSE41-NEXT: setns %dl
-; SSE41-NEXT: addb %bl, %cl
-; SSE41-NEXT: jno .LBB0_28
-; SSE41-NEXT: # %bb.27:
-; SSE41-NEXT: addb $127, %dl
-; SSE41-NEXT: movl %edx, %ecx
-; SSE41-NEXT: .LBB0_28:
-; SSE41-NEXT: pextrb $0, %xmm1, %esi
-; SSE41-NEXT: pextrb $0, %xmm0, %edx
-; SSE41-NEXT: movl %edx, %ebx
-; SSE41-NEXT: addb %sil, %bl
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: addb %sil, %dl
-; SSE41-NEXT: jno .LBB0_30
-; SSE41-NEXT: # %bb.29:
-; SSE41-NEXT: addb $127, %bl
-; SSE41-NEXT: movl %ebx, %edx
-; SSE41-NEXT: .LBB0_30:
-; SSE41-NEXT: pextrb $1, %xmm1, %esi
-; SSE41-NEXT: pextrb $1, %xmm0, %r8d
-; SSE41-NEXT: movl %r8d, %ebx
-; SSE41-NEXT: addb %sil, %bl
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: addb %sil, %r8b
-; SSE41-NEXT: jno .LBB0_32
-; SSE41-NEXT: # %bb.31:
-; SSE41-NEXT: addb $127, %bl
-; SSE41-NEXT: movl %ebx, %r8d
-; SSE41-NEXT: .LBB0_32:
-; SSE41-NEXT: movzbl %dl, %edx
-; SSE41-NEXT: movd %edx, %xmm0
-; SSE41-NEXT: movzbl %r8b, %edx
-; SSE41-NEXT: pinsrb $1, %edx, %xmm0
-; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm0
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $3, %eax, %xmm0
-; SSE41-NEXT: movzbl %dil, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm0
-; SSE41-NEXT: movzbl %bpl, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm0
-; SSE41-NEXT: movzbl %r9b, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm0
-; SSE41-NEXT: movzbl %r10b, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm0
-; SSE41-NEXT: movzbl %r13b, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm0
-; SSE41-NEXT: movzbl %r12b, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm0
-; SSE41-NEXT: movzbl %r15b, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $11, %eax, %xmm0
-; SSE41-NEXT: movzbl %r14b, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: movzbl %r11b, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v16i8:
+; SSE: # %bb.0:
+; SSE-NEXT: paddsb %xmm1, %xmm0
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: v16i8:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX-NEXT: vpextrb $15, %xmm0, %edx
-; AVX-NEXT: movl %edx, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %dl
-; AVX-NEXT: jno .LBB0_2
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: .LBB0_2:
-; AVX-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX-NEXT: vpextrb $14, %xmm0, %r11d
-; AVX-NEXT: movl %r11d, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %r11b
-; AVX-NEXT: jno .LBB0_4
-; AVX-NEXT: # %bb.3:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r11d
-; AVX-NEXT: .LBB0_4:
-; AVX-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX-NEXT: vpextrb $13, %xmm0, %edi
-; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %dil
-; AVX-NEXT: jno .LBB0_6
-; AVX-NEXT: # %bb.5:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %edi
-; AVX-NEXT: .LBB0_6:
-; AVX-NEXT: pushq %rbp
-; AVX-NEXT: pushq %r15
-; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %r13
-; AVX-NEXT: pushq %r12
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX-NEXT: vpextrb $12, %xmm0, %r14d
-; AVX-NEXT: movl %r14d, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %r14b
-; AVX-NEXT: jno .LBB0_8
-; AVX-NEXT: # %bb.7:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r14d
-; AVX-NEXT: .LBB0_8:
-; AVX-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX-NEXT: vpextrb $11, %xmm0, %ebp
-; AVX-NEXT: movl %ebp, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %bpl
-; AVX-NEXT: jno .LBB0_10
-; AVX-NEXT: # %bb.9:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %ebp
-; AVX-NEXT: .LBB0_10:
-; AVX-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX-NEXT: vpextrb $10, %xmm0, %r15d
-; AVX-NEXT: movl %r15d, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %r15b
-; AVX-NEXT: jno .LBB0_12
-; AVX-NEXT: # %bb.11:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r15d
-; AVX-NEXT: .LBB0_12:
-; AVX-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX-NEXT: vpextrb $9, %xmm0, %r12d
-; AVX-NEXT: movl %r12d, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %r12b
-; AVX-NEXT: jno .LBB0_14
-; AVX-NEXT: # %bb.13:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r12d
-; AVX-NEXT: .LBB0_14:
-; AVX-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX-NEXT: vpextrb $8, %xmm0, %r13d
-; AVX-NEXT: movl %r13d, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %r13b
-; AVX-NEXT: jno .LBB0_16
-; AVX-NEXT: # %bb.15:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r13d
-; AVX-NEXT: .LBB0_16:
-; AVX-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX-NEXT: vpextrb $7, %xmm0, %r10d
-; AVX-NEXT: movl %r10d, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %r10b
-; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX-NEXT: jno .LBB0_18
-; AVX-NEXT: # %bb.17:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r10d
-; AVX-NEXT: .LBB0_18:
-; AVX-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX-NEXT: vpextrb $6, %xmm0, %r9d
-; AVX-NEXT: movl %r9d, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %r9b
-; AVX-NEXT: jno .LBB0_20
-; AVX-NEXT: # %bb.19:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r9d
-; AVX-NEXT: .LBB0_20:
-; AVX-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX-NEXT: vpextrb $5, %xmm0, %ebp
-; AVX-NEXT: movl %ebp, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %bpl
-; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX-NEXT: jno .LBB0_22
-; AVX-NEXT: # %bb.21:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %ebp
-; AVX-NEXT: .LBB0_22:
-; AVX-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX-NEXT: vpextrb $4, %xmm0, %edi
-; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: addb %cl, %dil
-; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX-NEXT: jno .LBB0_24
-; AVX-NEXT: # %bb.23:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %edi
-; AVX-NEXT: .LBB0_24:
-; AVX-NEXT: vpextrb $3, %xmm1, %edx
-; AVX-NEXT: vpextrb $3, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: setns %cl
-; AVX-NEXT: addb %dl, %al
-; AVX-NEXT: jno .LBB0_26
-; AVX-NEXT: # %bb.25:
-; AVX-NEXT: addb $127, %cl
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: .LBB0_26:
-; AVX-NEXT: vpextrb $2, %xmm1, %ebx
-; AVX-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: addb %bl, %dl
-; AVX-NEXT: setns %dl
-; AVX-NEXT: addb %bl, %cl
-; AVX-NEXT: jno .LBB0_28
-; AVX-NEXT: # %bb.27:
-; AVX-NEXT: addb $127, %dl
-; AVX-NEXT: movl %edx, %ecx
-; AVX-NEXT: .LBB0_28:
-; AVX-NEXT: vpextrb $0, %xmm1, %esi
-; AVX-NEXT: vpextrb $0, %xmm0, %edx
-; AVX-NEXT: movl %edx, %ebx
-; AVX-NEXT: addb %sil, %bl
-; AVX-NEXT: setns %bl
-; AVX-NEXT: addb %sil, %dl
-; AVX-NEXT: jno .LBB0_30
-; AVX-NEXT: # %bb.29:
-; AVX-NEXT: addb $127, %bl
-; AVX-NEXT: movl %ebx, %edx
-; AVX-NEXT: .LBB0_30:
-; AVX-NEXT: vpextrb $1, %xmm1, %esi
-; AVX-NEXT: vpextrb $1, %xmm0, %r8d
-; AVX-NEXT: movl %r8d, %ebx
-; AVX-NEXT: addb %sil, %bl
-; AVX-NEXT: setns %bl
-; AVX-NEXT: addb %sil, %r8b
-; AVX-NEXT: jno .LBB0_32
-; AVX-NEXT: # %bb.31:
-; AVX-NEXT: addb $127, %bl
-; AVX-NEXT: movl %ebx, %r8d
-; AVX-NEXT: .LBB0_32:
-; AVX-NEXT: movzbl %dl, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: movzbl %r8b, %edx
-; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX-NEXT: movzbl %cl, %ecx
-; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %dil, %eax
-; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %bpl, %eax
-; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r9b, %eax
-; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r10b, %eax
-; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r13b, %eax
-; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r12b, %eax
-; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r15b, %eax
-; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r14b, %eax
-; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r11b, %eax
-; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %r12
-; AVX-NEXT: popq %r13
-; AVX-NEXT: popq %r14
-; AVX-NEXT: popq %r15
-; AVX-NEXT: popq %rbp
+; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %z = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %z
 }
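[Editorial note, not part of the patch] The single paddsb/vpaddsb the test now expects computes, per lane, exactly the clamping semantics documented for llvm.sadd.sat. A scalar reference version for one i8 lane (names ours, not LLVM code):

```cpp
// Reference semantics of @llvm.sadd.sat on an i8 lane: widen, add,
// clamp to [INT8_MIN, INT8_MAX], narrow back.
#include <cstdint>

int8_t sadd_sat_i8(int8_t x, int8_t y) {
  int s = int{x} + int{y};            // widen so the add cannot wrap
  if (s > INT8_MAX) return INT8_MAX;  // clamp to 127
  if (s < INT8_MIN) return INT8_MIN;  // clamp to -128
  return static_cast<int8_t>(s);
}
```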
 
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; SSE2-LABEL: v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b
-; SSE2-NEXT: movl %r8d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r8b
-; SSE2-NEXT: jno .LBB1_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r8d
-; SSE2-NEXT: .LBB1_2:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movl %r11d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r11b
-; SSE2-NEXT: jno .LBB1_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r11d
-; SSE2-NEXT: .LBB1_4:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movl %ebx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %bl
-; SSE2-NEXT: jno .LBB1_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebx
-; SSE2-NEXT: .LBB1_6:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil
-; SSE2-NEXT: movl %esi, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %sil
-; SSE2-NEXT: jno .LBB1_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %esi
-; SSE2-NEXT: .LBB1_8:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %dl
-; SSE2-NEXT: jo .LBB1_9
-; SSE2-NEXT: # %bb.10:
-; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jmp .LBB1_11
-; SSE2-NEXT: .LBB1_9:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: .LBB1_11:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil
-; SSE2-NEXT: movl %edi, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %dil
-; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_13
-; SSE2-NEXT: # %bb.12:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %edi
-; SSE2-NEXT: .LBB1_13:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b
-; SSE2-NEXT: movl %r9d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r9b
-; SSE2-NEXT: jno .LBB1_15
-; SSE2-NEXT: # %bb.14:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r9d
-; SSE2-NEXT: .LBB1_15:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b
-; SSE2-NEXT: movl %r10d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r10b
-; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_17
-; SSE2-NEXT: # %bb.16:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r10d
-; SSE2-NEXT: .LBB1_17:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl
-; SSE2-NEXT: movl %ebp, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %bpl
-; SSE2-NEXT: jno .LBB1_19
-; SSE2-NEXT: # %bb.18:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebp
-; SSE2-NEXT: .LBB1_19:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b
-; SSE2-NEXT: movl %r14d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r14b
-; SSE2-NEXT: jno .LBB1_21
-; SSE2-NEXT: # %bb.20:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r14d
-; SSE2-NEXT: .LBB1_21:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b
-; SSE2-NEXT: movl %r11d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r11b
-; SSE2-NEXT: jno .LBB1_23
-; SSE2-NEXT: # %bb.22:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r11d
-; SSE2-NEXT: .LBB1_23:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b
-; SSE2-NEXT: movl %r8d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r8b
-; SSE2-NEXT: jno .LBB1_25
-; SSE2-NEXT: # %bb.24:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r8d
-; SSE2-NEXT: .LBB1_25:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b
-; SSE2-NEXT: movl %r15d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r15b
-; SSE2-NEXT: jno .LBB1_27
-; SSE2-NEXT: # %bb.26:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r15d
-; SSE2-NEXT: .LBB1_27:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b
-; SSE2-NEXT: movl %r12d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r12b
-; SSE2-NEXT: jno .LBB1_29
-; SSE2-NEXT: # %bb.28:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r12d
-; SSE2-NEXT: .LBB1_29:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b
-; SSE2-NEXT: movl %r13d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r13b
-; SSE2-NEXT: jno .LBB1_31
-; SSE2-NEXT: # %bb.30:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r13d
-; SSE2-NEXT: .LBB1_31:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %dl
-; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_33
-; SSE2-NEXT: # %bb.32:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: .LBB1_33:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movl %esi, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %sil
-; SSE2-NEXT: jno .LBB1_35
-; SSE2-NEXT: # %bb.34:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %esi
-; SSE2-NEXT: .LBB1_35:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %dl
-; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_37
-; SSE2-NEXT: # %bb.36:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: .LBB1_37:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %dl
-; SSE2-NEXT: jno .LBB1_39
-; SSE2-NEXT: # %bb.38:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %edx
-; SSE2-NEXT: .LBB1_39:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movl %ebx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %bl
-; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_41
-; SSE2-NEXT: # %bb.40:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: .LBB1_41:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movl %ebx, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %bl
-; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_43
-; SSE2-NEXT: # %bb.42:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebx
-; SSE2-NEXT: .LBB1_43:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl
-; SSE2-NEXT: movl %ebp, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %bpl
-; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_45
-; SSE2-NEXT: # %bb.44:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebp
-; SSE2-NEXT: .LBB1_45:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b
-; SSE2-NEXT: movl %r14d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r14b
-; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_47
-; SSE2-NEXT: # %bb.46:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r14d
-; SSE2-NEXT: .LBB1_47:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b
-; SSE2-NEXT: movl %r15d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r15b
-; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_49
-; SSE2-NEXT: # %bb.48:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r15d
-; SSE2-NEXT: .LBB1_49:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b
-; SSE2-NEXT: movl %r12d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r12b
-; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_51
-; SSE2-NEXT: # %bb.50:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r12d
-; SSE2-NEXT: .LBB1_51:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b
-; SSE2-NEXT: movl %r13d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r13b
-; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_53
-; SSE2-NEXT: # %bb.52:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r13d
-; SSE2-NEXT: .LBB1_53:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b
-; SSE2-NEXT: movl %r8d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r8b
-; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_55
-; SSE2-NEXT: # %bb.54:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r8d
-; SSE2-NEXT: .LBB1_55:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b
-; SSE2-NEXT: movl %r9d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r9b
-; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_57
-; SSE2-NEXT: # %bb.56:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r9d
-; SSE2-NEXT: .LBB1_57:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b
-; SSE2-NEXT: movl %r10d, %eax
-; SSE2-NEXT: addb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: addb %cl, %r10b
-; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_59
-; SSE2-NEXT: # %bb.58:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r10d
-; SSE2-NEXT: .LBB1_59:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b
-; SSE2-NEXT: movl %r11d, %ecx
-; SSE2-NEXT: addb %dl, %cl
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addb %dl, %r11b
-; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jno .LBB1_61
-; SSE2-NEXT: # %bb.60:
-; SSE2-NEXT: addb $127, %cl
-; SSE2-NEXT: movl %ecx, %r11d
-; SSE2-NEXT: .LBB1_61:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movl %ecx, %edx
-; SSE2-NEXT: addb %bl, %dl
-; SSE2-NEXT: setns %dl
-; SSE2-NEXT: addb %bl, %cl
-; SSE2-NEXT: jno .LBB1_63
-; SSE2-NEXT: # %bb.62:
-; SSE2-NEXT: addb $127, %dl
-; SSE2-NEXT: movl %edx, %ecx
-; SSE2-NEXT: .LBB1_63:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movl %ebx, %edx
-; SSE2-NEXT: addb %al, %dl
-; SSE2-NEXT: setns %dl
-; SSE2-NEXT: addb %al, %bl
-; SSE2-NEXT: jno .LBB1_65
-; SSE2-NEXT: # %bb.64:
-; SSE2-NEXT: addb $127, %dl
-; SSE2-NEXT: movl %edx, %ebx
-; SSE2-NEXT: .LBB1_65:
-; SSE2-NEXT: movzbl %bl, %esi
-; SSE2-NEXT: movzbl %cl, %edi
-; SSE2-NEXT: movzbl %r11b, %r11d
-; SSE2-NEXT: movzbl %r10b, %r10d
-; SSE2-NEXT: movzbl %r9b, %r9d
-; SSE2-NEXT: movzbl %r8b, %r8d
-; SSE2-NEXT: movzbl %r13b, %r13d
-; SSE2-NEXT: movzbl %r12b, %eax
-; SSE2-NEXT: movzbl %r15b, %ebx
-; SSE2-NEXT: movzbl %r14b, %edx
-; SSE2-NEXT: movzbl %bpl, %ebp
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
-; SSE2-NEXT: movd %esi, %xmm12
-; SSE2-NEXT: movd %edi, %xmm6
-; SSE2-NEXT: movd %r11d, %xmm11
-; SSE2-NEXT: movd %r10d, %xmm2
-; SSE2-NEXT: movd %r9d, %xmm10
-; SSE2-NEXT: movd %r8d, %xmm5
-; SSE2-NEXT: movd %r13d, %xmm9
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movd %ebx, %xmm8
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
-; SSE2-NEXT: movd %edx, %xmm14
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
-; SSE2-NEXT: movd %ebp, %xmm13
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SSE2-NEXT: movd %ecx, %xmm7
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE2-NEXT: movd %r12d, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSE2-NEXT: movd %r15d, %xmm4
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
-; SSE2-NEXT: movd %r14d, %xmm15
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT: movd %r13d, %xmm0
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
-; SSE2-NEXT: movd %r11d, %xmm3
-; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: movd %r8d, %xmm11
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE2-NEXT: movd %r9d, %xmm12
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; SSE2-NEXT: movd %r10d, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
-; SSE2-NEXT: movd %ebx, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movd %r12d, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
-; SSE2-NEXT: movd %esi, %xmm8
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; SSE2-NEXT: movd %r15d, %xmm13
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
-; SSE2-NEXT: movd %ebp, %xmm3
-; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
-; SSE2-NEXT: movd %r14d, %xmm14
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; SSE2-NEXT: movd %edx, %xmm15
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movd %r13d, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE2-NEXT: movd %edi, %xmm7
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: movd %r11d, %xmm2
-; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 =
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE2-NEXT: addq $8, %rsp -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v32i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: pushq %rax -; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r8b -; SSSE3-NEXT: jno .LBB1_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB1_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r11b -; SSSE3-NEXT: jno .LBB1_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB1_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: jno .LBB1_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB1_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: jno .LBB1_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB1_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jo .LBB1_9 -; SSSE3-NEXT: # %bb.10: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB1_11 -; SSSE3-NEXT: .LBB1_9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_11: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dil -; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_13 -; SSSE3-NEXT: # %bb.12: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB1_13: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: jno .LBB1_15 -; SSSE3-NEXT: # %bb.14: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB1_15: -; SSSE3-NEXT: movb 
-{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r10b -; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_17 -; SSSE3-NEXT: # %bb.16: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB1_17: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bpl -; SSSE3-NEXT: jno .LBB1_19 -; SSSE3-NEXT: # %bb.18: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB1_19: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r14b -; SSSE3-NEXT: jno .LBB1_21 -; SSSE3-NEXT: # %bb.20: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB1_21: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r11b -; SSSE3-NEXT: jno .LBB1_23 -; SSSE3-NEXT: # %bb.22: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB1_23: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r8b -; SSSE3-NEXT: jno .LBB1_25 -; SSSE3-NEXT: # %bb.24: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB1_25: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r15b -; SSSE3-NEXT: jno .LBB1_27 -; SSSE3-NEXT: # %bb.26: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB1_27: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r12b -; SSSE3-NEXT: jno .LBB1_29 -; SSSE3-NEXT: # %bb.28: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB1_29: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r13b -; SSSE3-NEXT: jno .LBB1_31 -; SSSE3-NEXT: # %bb.30: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB1_31: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_33 -; SSSE3-NEXT: # %bb.32: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_33: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %esi, 
%eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: jno .LBB1_35 -; SSSE3-NEXT: # %bb.34: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB1_35: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_37 -; SSSE3-NEXT: # %bb.36: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_37: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB1_39 -; SSSE3-NEXT: # %bb.38: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB1_39: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_41 -; SSSE3-NEXT: # %bb.40: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_41: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_43 -; SSSE3-NEXT: # %bb.42: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB1_43: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bpl -; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_45 -; SSSE3-NEXT: # %bb.44: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB1_45: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r14b -; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_47 -; SSSE3-NEXT: # %bb.46: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB1_47: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r15b -; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_49 -; SSSE3-NEXT: # %bb.48: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB1_49: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r12b -; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_51 -; SSSE3-NEXT: # %bb.50: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, 
%r12d -; SSSE3-NEXT: .LBB1_51: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r13b -; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_53 -; SSSE3-NEXT: # %bb.52: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB1_53: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r8b -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_55 -; SSSE3-NEXT: # %bb.54: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB1_55: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_57 -; SSSE3-NEXT: # %bb.56: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB1_57: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r10b -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_59 -; SSSE3-NEXT: # %bb.58: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB1_59: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %ecx -; SSSE3-NEXT: addb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addb %dl, %r11b -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_61 -; SSSE3-NEXT: # %bb.60: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %r11d -; SSSE3-NEXT: .LBB1_61: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %ecx, %edx -; SSSE3-NEXT: addb %bl, %dl -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addb %bl, %cl -; SSSE3-NEXT: jno .LBB1_63 -; SSSE3-NEXT: # %bb.62: -; SSSE3-NEXT: addb $127, %dl -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB1_63: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: addb %al, %dl -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addb %al, %bl -; SSSE3-NEXT: jno .LBB1_65 -; SSSE3-NEXT: # %bb.64: -; SSSE3-NEXT: addb $127, %dl -; SSSE3-NEXT: movl %edx, %ebx -; SSSE3-NEXT: .LBB1_65: -; SSSE3-NEXT: movzbl %bl, %esi -; SSSE3-NEXT: movzbl %cl, %edi -; SSSE3-NEXT: movzbl %r11b, %r11d -; SSSE3-NEXT: movzbl %r10b, %r10d -; SSSE3-NEXT: movzbl %r9b, %r9d -; SSSE3-NEXT: movzbl %r8b, %r8d -; SSSE3-NEXT: movzbl %r13b, %r13d -; SSSE3-NEXT: movzbl %r12b, %eax -; SSSE3-NEXT: movzbl %r15b, %ebx -; SSSE3-NEXT: movzbl %r14b, %edx -; SSSE3-NEXT: movzbl %bpl, %ebp -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSSE3-NEXT: movd %esi, %xmm12 -; SSSE3-NEXT: movd 
%edi, %xmm6 -; SSSE3-NEXT: movd %r11d, %xmm11 -; SSSE3-NEXT: movd %r10d, %xmm2 -; SSSE3-NEXT: movd %r9d, %xmm10 -; SSSE3-NEXT: movd %r8d, %xmm5 -; SSSE3-NEXT: movd %r13d, %xmm9 -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movd %ebx, %xmm8 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSSE3-NEXT: movd %edx, %xmm14 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSSE3-NEXT: movd %ebp, %xmm13 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSSE3-NEXT: movd %ecx, %xmm7 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %r12d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movd %r15d, %xmm4 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSSE3-NEXT: movd %r14d, %xmm15 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movd %r13d, %xmm0 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSSE3-NEXT: movd %r11d, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSSE3-NEXT: movd %r8d, %xmm11 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSSE3-NEXT: movd %r9d, %xmm12 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSSE3-NEXT: movd %r10d, %xmm10 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSSE3-NEXT: movd %ebx, %xmm9 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSSE3-NEXT: movd %eax, %xmm5 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movd %r12d, %xmm6 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] -; SSSE3-NEXT: movd %esi, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSSE3-NEXT: movd %r15d, %xmm13 -; 
SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSSE3-NEXT: movd %ebp, %xmm3 -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSSE3-NEXT: movd %r14d, %xmm14 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSSE3-NEXT: movd %edx, %xmm15 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSSE3-NEXT: movd %r13d, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSSE3-NEXT: movd %edi, %xmm7 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movd %r11d, %xmm2 -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSSE3-NEXT: addq $8, %rsp -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $15, %xmm3, %ecx -; SSE41-NEXT: pextrb $15, %xmm1, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: jno .LBB1_2 -; SSE41-NEXT: # 
%bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB1_2: -; SSE41-NEXT: pextrb $14, %xmm3, %ecx -; SSE41-NEXT: pextrb $14, %xmm1, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %sil -; SSE41-NEXT: jno .LBB1_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB1_4: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $13, %xmm3, %ecx -; SSE41-NEXT: pextrb $13, %xmm1, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: jo .LBB1_5 -; SSE41-NEXT: # %bb.6: -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jmp .LBB1_7 -; SSE41-NEXT: .LBB1_5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB1_7: -; SSE41-NEXT: pextrb $12, %xmm3, %ecx -; SSE41-NEXT: pextrb $12, %xmm1, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: jno .LBB1_9 -; SSE41-NEXT: # %bb.8: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB1_9: -; SSE41-NEXT: pextrb $11, %xmm3, %ecx -; SSE41-NEXT: pextrb $11, %xmm1, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: jno .LBB1_11 -; SSE41-NEXT: # %bb.10: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB1_11: -; SSE41-NEXT: pextrb $10, %xmm3, %ecx -; SSE41-NEXT: pextrb $10, %xmm1, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bl -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_13 -; SSE41-NEXT: # %bb.12: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB1_13: -; SSE41-NEXT: pextrb $9, %xmm3, %ecx -; SSE41-NEXT: pextrb $9, %xmm1, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_15 -; SSE41-NEXT: # %bb.14: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB1_15: -; SSE41-NEXT: pextrb $8, %xmm3, %ecx -; SSE41-NEXT: pextrb $8, %xmm1, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %sil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_17 -; SSE41-NEXT: # %bb.16: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB1_17: -; SSE41-NEXT: pextrb $7, %xmm3, %ecx -; SSE41-NEXT: pextrb $7, %xmm1, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_19 -; SSE41-NEXT: # %bb.18: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB1_19: -; SSE41-NEXT: pextrb $6, %xmm3, %ecx -; SSE41-NEXT: pextrb $6, %xmm1, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; 
SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_21 -; SSE41-NEXT: # %bb.20: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB1_21: -; SSE41-NEXT: pextrb $5, %xmm3, %ecx -; SSE41-NEXT: pextrb $5, %xmm1, %r8d -; SSE41-NEXT: movl %r8d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r8b -; SSE41-NEXT: jno .LBB1_23 -; SSE41-NEXT: # %bb.22: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r8d -; SSE41-NEXT: .LBB1_23: -; SSE41-NEXT: pextrb $4, %xmm3, %ecx -; SSE41-NEXT: pextrb $4, %xmm1, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r11b -; SSE41-NEXT: jno .LBB1_25 -; SSE41-NEXT: # %bb.24: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB1_25: -; SSE41-NEXT: pextrb $3, %xmm3, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r10b -; SSE41-NEXT: jno .LBB1_27 -; SSE41-NEXT: # %bb.26: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB1_27: -; SSE41-NEXT: pextrb $2, %xmm3, %ecx -; SSE41-NEXT: pextrb $2, %xmm1, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r14b -; SSE41-NEXT: jno .LBB1_29 -; SSE41-NEXT: # %bb.28: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB1_29: -; SSE41-NEXT: pextrb $0, %xmm3, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r9b -; SSE41-NEXT: jno .LBB1_31 -; SSE41-NEXT: # %bb.30: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB1_31: -; SSE41-NEXT: pextrb $1, %xmm3, %ecx -; SSE41-NEXT: pextrb $1, %xmm1, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: jno .LBB1_33 -; SSE41-NEXT: # %bb.32: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB1_33: -; SSE41-NEXT: pextrb $15, %xmm2, %ecx -; SSE41-NEXT: pextrb $15, %xmm0, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: movl %esi, %r15d -; SSE41-NEXT: addb %cl, %bl -; SSE41-NEXT: jno .LBB1_35 -; SSE41-NEXT: # %bb.34: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB1_35: -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $14, %xmm2, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: movl %edx, %edi -; SSE41-NEXT: addb %cl, %sil -; SSE41-NEXT: jno .LBB1_37 -; SSE41-NEXT: # %bb.36: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB1_37: -; SSE41-NEXT: pextrb $13, %xmm2, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jo .LBB1_38 -; SSE41-NEXT: # %bb.39: -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jmp .LBB1_40 -; SSE41-NEXT: .LBB1_38: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al 
def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB1_40: -; SSE41-NEXT: movl %edi, %edx -; SSE41-NEXT: pextrb $12, %xmm2, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: movl %r15d, %esi -; SSE41-NEXT: jno .LBB1_42 -; SSE41-NEXT: # %bb.41: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB1_42: -; SSE41-NEXT: pextrb $11, %xmm2, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r15b -; SSE41-NEXT: jno .LBB1_44 -; SSE41-NEXT: # %bb.43: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB1_44: -; SSE41-NEXT: pextrb $10, %xmm2, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r12b -; SSE41-NEXT: jno .LBB1_46 -; SSE41-NEXT: # %bb.45: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB1_46: -; SSE41-NEXT: pextrb $9, %xmm2, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r13b -; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_48 -; SSE41-NEXT: # %bb.47: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB1_48: -; SSE41-NEXT: pextrb $8, %xmm2, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r11b -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_50 -; SSE41-NEXT: # %bb.49: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB1_50: -; SSE41-NEXT: pextrb $7, %xmm2, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r10b -; SSE41-NEXT: jno .LBB1_52 -; SSE41-NEXT: # %bb.51: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB1_52: -; SSE41-NEXT: pextrb $6, %xmm2, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: movl %edi, %r14d -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: jno .LBB1_54 -; SSE41-NEXT: # %bb.53: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB1_54: -; SSE41-NEXT: pextrb $5, %xmm2, %ecx -; SSE41-NEXT: pextrb $5, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: jno .LBB1_56 -; SSE41-NEXT: # %bb.55: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB1_56: -; SSE41-NEXT: pextrb $4, %xmm2, %edx -; SSE41-NEXT: pextrb $4, %xmm0, %eax -; SSE41-NEXT: movl 
%eax, %ecx -; SSE41-NEXT: addb %dl, %cl -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addb %dl, %al -; SSE41-NEXT: jno .LBB1_58 -; SSE41-NEXT: # %bb.57: -; SSE41-NEXT: addb $127, %cl -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB1_58: -; SSE41-NEXT: pextrb $3, %xmm2, %ebx -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: addb %bl, %dl -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addb %bl, %cl -; SSE41-NEXT: jno .LBB1_60 -; SSE41-NEXT: # %bb.59: -; SSE41-NEXT: addb $127, %dl -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_60: -; SSE41-NEXT: pextrb $2, %xmm2, %esi -; SSE41-NEXT: pextrb $2, %xmm0, %edx -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: addb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addb %sil, %dl -; SSE41-NEXT: jno .LBB1_62 -; SSE41-NEXT: # %bb.61: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: .LBB1_62: -; SSE41-NEXT: pextrb $0, %xmm2, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %r8d -; SSE41-NEXT: movl %r8d, %ebx -; SSE41-NEXT: addb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addb %sil, %r8b -; SSE41-NEXT: jno .LBB1_64 -; SSE41-NEXT: # %bb.63: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r8d -; SSE41-NEXT: .LBB1_64: -; SSE41-NEXT: pextrb $1, %xmm2, %esi -; SSE41-NEXT: pextrb $1, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %ebx -; SSE41-NEXT: addb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addb %sil, %r9b -; SSE41-NEXT: jno .LBB1_66 -; SSE41-NEXT: # %bb.65: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r9d -; SSE41-NEXT: .LBB1_66: -; SSE41-NEXT: movzbl %r8b, %esi -; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: movzbl %r9b, %esi -; SSE41-NEXT: pinsrb $1, %esi, %xmm0 -; SSE41-NEXT: movzbl %dl, %edx -; SSE41-NEXT: pinsrb $2, %edx, %xmm0 -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm0 -; SSE41-NEXT: movzbl %dil, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm0 -; SSE41-NEXT: movzbl %bpl, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm0 -; SSE41-NEXT: movzbl %r10b, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: movzbl %r11b, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: pinsrb $11, %eax, %xmm0 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $14, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: movd %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $1, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $2, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $3, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $4, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $5, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), 
%eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $6, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $7, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $8, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $9, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $10, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $11, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $12, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $13, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $14, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $15, %eax, %xmm1
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v32i8:
+; SSE: # %bb.0:
+; SSE-NEXT: paddsb %xmm2, %xmm0
+; SSE-NEXT: paddsb %xmm3, %xmm1
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: v32i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $15, %xmm0, %edx
-; AVX1-NEXT: movl %edx, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: addb %cl, %dl
-; AVX1-NEXT: jo .LBB1_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jmp .LBB1_3
-; AVX1-NEXT: .LBB1_1:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: # kill: def $al killed $al def $eax
-; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: .LBB1_3:
-; AVX1-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $14, %xmm0, %edx
-; AVX1-NEXT: movl %edx, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: addb %cl, %dl
-; AVX1-NEXT: jno .LBB1_5
-; AVX1-NEXT: # %bb.4:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: .LBB1_5:
-; AVX1-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $13, %xmm0, %esi
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: addb %cl, %sil
-; AVX1-NEXT: jo .LBB1_6
-; AVX1-NEXT: # %bb.7:
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jmp .LBB1_8
-; AVX1-NEXT: .LBB1_6:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: # kill: def $al killed $al def $eax
-; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: .LBB1_8:
-; AVX1-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $12, %xmm0, %esi
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: addb %cl, %sil
-; AVX1-NEXT: jno .LBB1_10
-; AVX1-NEXT: # %bb.9:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %esi
-; AVX1-NEXT: .LBB1_10:
-; AVX1-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $11, %xmm0, %edi
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: addb %cl, %dil
-; AVX1-NEXT: jno .LBB1_12
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %edi
-; AVX1-NEXT: .LBB1_12: -; AVX1-NEXT: vpextrb $10, %xmm1, %ecx -; AVX1-NEXT: vpextrb $10, %xmm0, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %bpl -; AVX1-NEXT: jno .LBB1_14 -; AVX1-NEXT: # %bb.13: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB1_14: -; AVX1-NEXT: vpextrb $9, %xmm1, %ecx -; AVX1-NEXT: vpextrb $9, %xmm0, %ebx -; AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %bl -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_16 -; AVX1-NEXT: # %bb.15: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB1_16: -; AVX1-NEXT: vpextrb $8, %xmm1, %ecx -; AVX1-NEXT: vpextrb $8, %xmm0, %esi -; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %sil -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_18 -; AVX1-NEXT: # %bb.17: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %esi -; AVX1-NEXT: .LBB1_18: -; AVX1-NEXT: vpextrb $7, %xmm1, %ecx -; AVX1-NEXT: vpextrb $7, %xmm0, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %dl -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_20 -; AVX1-NEXT: # %bb.19: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB1_20: -; AVX1-NEXT: vpextrb $6, %xmm1, %ecx -; AVX1-NEXT: vpextrb $6, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %dil -; AVX1-NEXT: jno .LBB1_22 -; AVX1-NEXT: # %bb.21: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB1_22: -; AVX1-NEXT: vpextrb $5, %xmm1, %ecx -; AVX1-NEXT: vpextrb $5, %xmm0, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %bpl -; AVX1-NEXT: jno .LBB1_24 -; AVX1-NEXT: # %bb.23: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB1_24: -; AVX1-NEXT: vpextrb $4, %xmm1, %ecx -; AVX1-NEXT: vpextrb $4, %xmm0, %r11d -; AVX1-NEXT: movl %r11d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r11b -; AVX1-NEXT: jno .LBB1_26 -; AVX1-NEXT: # %bb.25: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r11d -; AVX1-NEXT: .LBB1_26: -; AVX1-NEXT: vpextrb $3, %xmm1, %ecx -; AVX1-NEXT: vpextrb $3, %xmm0, %r14d -; AVX1-NEXT: movl %r14d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r14b -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_28 -; AVX1-NEXT: # %bb.27: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: .LBB1_28: -; AVX1-NEXT: vpextrb $2, %xmm1, %ecx -; AVX1-NEXT: vpextrb $2, %xmm0, %r8d -; AVX1-NEXT: movl %r8d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r8b -; AVX1-NEXT: jno .LBB1_30 -; AVX1-NEXT: # %bb.29: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r8d -; AVX1-NEXT: .LBB1_30: -; AVX1-NEXT: vpextrb $0, %xmm1, %ecx -; AVX1-NEXT: vpextrb $0, %xmm0, %r10d -; AVX1-NEXT: movl %r10d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r10b -; AVX1-NEXT: jno .LBB1_32 -; AVX1-NEXT: # %bb.31: -; 
AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r10d -; AVX1-NEXT: .LBB1_32: -; AVX1-NEXT: vpextrb $1, %xmm1, %ecx -; AVX1-NEXT: vpextrb $1, %xmm0, %r9d -; AVX1-NEXT: movl %r9d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r9b -; AVX1-NEXT: jno .LBB1_34 -; AVX1-NEXT: # %bb.33: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r9d -; AVX1-NEXT: .LBB1_34: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpextrb $15, %xmm0, %ebx -; AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: movl %esi, %r12d -; AVX1-NEXT: addb %cl, %bl -; AVX1-NEXT: jno .LBB1_36 -; AVX1-NEXT: # %bb.35: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB1_36: -; AVX1-NEXT: vpextrb $14, %xmm1, %ecx -; AVX1-NEXT: vpextrb $14, %xmm0, %esi -; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: movl %edx, %r13d -; AVX1-NEXT: addb %cl, %sil -; AVX1-NEXT: jno .LBB1_38 -; AVX1-NEXT: # %bb.37: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %esi -; AVX1-NEXT: .LBB1_38: -; AVX1-NEXT: vpextrb $13, %xmm1, %ecx -; AVX1-NEXT: vpextrb $13, %xmm0, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: movl %edi, %ebp -; AVX1-NEXT: addb %cl, %dl -; AVX1-NEXT: jno .LBB1_40 -; AVX1-NEXT: # %bb.39: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB1_40: -; AVX1-NEXT: vpextrb $12, %xmm1, %ecx -; AVX1-NEXT: vpextrb $12, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %dil -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jo .LBB1_41 -; AVX1-NEXT: # %bb.42: -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jmp .LBB1_43 -; AVX1-NEXT: .LBB1_41: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB1_43: -; AVX1-NEXT: movl %ebp, %edi -; AVX1-NEXT: vpextrb $11, %xmm1, %ecx -; AVX1-NEXT: vpextrb $11, %xmm0, %r15d -; AVX1-NEXT: movl %r15d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r15b -; AVX1-NEXT: movl %r12d, %esi -; AVX1-NEXT: movl %r13d, %edx -; AVX1-NEXT: jno .LBB1_45 -; AVX1-NEXT: # %bb.44: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: .LBB1_45: -; AVX1-NEXT: vpextrb $10, %xmm1, %ecx -; AVX1-NEXT: vpextrb $10, %xmm0, %r12d -; AVX1-NEXT: movl %r12d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r12b -; AVX1-NEXT: jno .LBB1_47 -; AVX1-NEXT: # %bb.46: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r12d -; AVX1-NEXT: .LBB1_47: -; AVX1-NEXT: vpextrb $9, %xmm1, %ecx -; AVX1-NEXT: vpextrb $9, %xmm0, %r13d -; AVX1-NEXT: movl %r13d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r13b -; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno 
.LBB1_49 -; AVX1-NEXT: # %bb.48: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r13d -; AVX1-NEXT: .LBB1_49: -; AVX1-NEXT: vpextrb $8, %xmm1, %ecx -; AVX1-NEXT: vpextrb $8, %xmm0, %r11d -; AVX1-NEXT: movl %r11d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r11b -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_51 -; AVX1-NEXT: # %bb.50: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r11d -; AVX1-NEXT: .LBB1_51: -; AVX1-NEXT: vpextrb $7, %xmm1, %ecx -; AVX1-NEXT: vpextrb $7, %xmm0, %r10d -; AVX1-NEXT: movl %r10d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r10b -; AVX1-NEXT: jno .LBB1_53 -; AVX1-NEXT: # %bb.52: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r10d -; AVX1-NEXT: .LBB1_53: -; AVX1-NEXT: vpextrb $6, %xmm1, %ecx -; AVX1-NEXT: vpextrb $6, %xmm0, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %bpl -; AVX1-NEXT: jno .LBB1_55 -; AVX1-NEXT: # %bb.54: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB1_55: -; AVX1-NEXT: vpextrb $5, %xmm1, %ecx -; AVX1-NEXT: vpextrb $5, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %dil -; AVX1-NEXT: jno .LBB1_57 -; AVX1-NEXT: # %bb.56: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB1_57: -; AVX1-NEXT: vpextrb $4, %xmm1, %edx -; AVX1-NEXT: vpextrb $4, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addb %dl, %al -; AVX1-NEXT: jno .LBB1_59 -; AVX1-NEXT: # %bb.58: -; AVX1-NEXT: addb $127, %cl -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB1_59: -; AVX1-NEXT: vpextrb $3, %xmm1, %ebx -; AVX1-NEXT: vpextrb $3, %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: addb %bl, %dl -; AVX1-NEXT: setns %dl -; AVX1-NEXT: addb %bl, %cl -; AVX1-NEXT: jno .LBB1_61 -; AVX1-NEXT: # %bb.60: -; AVX1-NEXT: addb $127, %dl -; AVX1-NEXT: movl %edx, %ecx -; AVX1-NEXT: .LBB1_61: -; AVX1-NEXT: vpextrb $2, %xmm1, %esi -; AVX1-NEXT: vpextrb $2, %xmm0, %edx -; AVX1-NEXT: movl %edx, %ebx -; AVX1-NEXT: addb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addb %sil, %dl -; AVX1-NEXT: jno .LBB1_63 -; AVX1-NEXT: # %bb.62: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %edx -; AVX1-NEXT: .LBB1_63: -; AVX1-NEXT: vpextrb $0, %xmm1, %esi -; AVX1-NEXT: vpextrb $0, %xmm0, %r8d -; AVX1-NEXT: movl %r8d, %ebx -; AVX1-NEXT: addb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addb %sil, %r8b -; AVX1-NEXT: jo .LBB1_64 -; AVX1-NEXT: # %bb.65: -; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX1-NEXT: jmp .LBB1_66 -; AVX1-NEXT: .LBB1_64: -; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r8d -; AVX1-NEXT: .LBB1_66: -; AVX1-NEXT: vpextrb $1, %xmm1, %esi -; AVX1-NEXT: vpextrb $1, %xmm0, %r9d -; AVX1-NEXT: movl %r9d, %ebx -; AVX1-NEXT: addb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addb %sil, %r9b -; AVX1-NEXT: jno .LBB1_68 -; AVX1-NEXT: # %bb.67: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r9d -; AVX1-NEXT: .LBB1_68: -; AVX1-NEXT: movzbl %r8b, %esi -; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: movzbl %r9b, %esi -; AVX1-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX1-NEXT: 
movzbl %dl, %edx
-; AVX1-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %dil, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %bpl, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r10b, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r11b, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r13b, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r12b, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r15b, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl %r14b, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ;
AVX2-LABEL: v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; AVX2-NEXT: vpextrb $15, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: jo .LBB1_1 -; AVX2-NEXT: # %bb.2: -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB1_3 -; AVX2-NEXT: .LBB1_1: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB1_3: -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: jno .LBB1_5 -; AVX2-NEXT: # %bb.4: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB1_5: -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: jo .LBB1_6 -; AVX2-NEXT: # %bb.7: -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB1_8 -; AVX2-NEXT: .LBB1_6: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB1_8: -; AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: jno .LBB1_10 -; AVX2-NEXT: # %bb.9: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB1_10: -; AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: jno .LBB1_12 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB1_12: -; AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jno .LBB1_14 -; AVX2-NEXT: # %bb.13: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB1_14: -; AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bl -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_16 -; AVX2-NEXT: # %bb.15: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB1_16: -; AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_18 -; AVX2-NEXT: # %bb.17: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB1_18: -; AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; 
AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_20 -; AVX2-NEXT: # %bb.19: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB1_20: -; AVX2-NEXT: vpextrb $6, %xmm1, %ecx -; AVX2-NEXT: vpextrb $6, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: jno .LBB1_22 -; AVX2-NEXT: # %bb.21: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB1_22: -; AVX2-NEXT: vpextrb $5, %xmm1, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jno .LBB1_24 -; AVX2-NEXT: # %bb.23: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB1_24: -; AVX2-NEXT: vpextrb $4, %xmm1, %ecx -; AVX2-NEXT: vpextrb $4, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r11b -; AVX2-NEXT: jno .LBB1_26 -; AVX2-NEXT: # %bb.25: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB1_26: -; AVX2-NEXT: vpextrb $3, %xmm1, %ecx -; AVX2-NEXT: vpextrb $3, %xmm0, %r14d -; AVX2-NEXT: movl %r14d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r14b -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_28 -; AVX2-NEXT: # %bb.27: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: .LBB1_28: -; AVX2-NEXT: vpextrb $2, %xmm1, %ecx -; AVX2-NEXT: vpextrb $2, %xmm0, %r8d -; AVX2-NEXT: movl %r8d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r8b -; AVX2-NEXT: jno .LBB1_30 -; AVX2-NEXT: # %bb.29: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r8d -; AVX2-NEXT: .LBB1_30: -; AVX2-NEXT: vpextrb $0, %xmm1, %ecx -; AVX2-NEXT: vpextrb $0, %xmm0, %r10d -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r10b -; AVX2-NEXT: jno .LBB1_32 -; AVX2-NEXT: # %bb.31: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r10d -; AVX2-NEXT: .LBB1_32: -; AVX2-NEXT: vpextrb $1, %xmm1, %ecx -; AVX2-NEXT: vpextrb $1, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r9b -; AVX2-NEXT: jno .LBB1_34 -; AVX2-NEXT: # %bb.33: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r9d -; AVX2-NEXT: .LBB1_34: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: movl %esi, %r12d -; AVX2-NEXT: addb %cl, %bl -; AVX2-NEXT: jno .LBB1_36 -; AVX2-NEXT: # %bb.35: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB1_36: -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: movl %edx, %r13d -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: jno .LBB1_38 -; AVX2-NEXT: # %bb.37: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB1_38: -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: movl %edi, 
%ebp -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: jno .LBB1_40 -; AVX2-NEXT: # %bb.39: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB1_40: -; AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jo .LBB1_41 -; AVX2-NEXT: # %bb.42: -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB1_43 -; AVX2-NEXT: .LBB1_41: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB1_43: -; AVX2-NEXT: movl %ebp, %edi -; AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, %r15d -; AVX2-NEXT: movl %r15d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r15b -; AVX2-NEXT: movl %r12d, %esi -; AVX2-NEXT: movl %r13d, %edx -; AVX2-NEXT: jno .LBB1_45 -; AVX2-NEXT: # %bb.44: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: .LBB1_45: -; AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r12b -; AVX2-NEXT: jno .LBB1_47 -; AVX2-NEXT: # %bb.46: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r12d -; AVX2-NEXT: .LBB1_47: -; AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r13b -; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_49 -; AVX2-NEXT: # %bb.48: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: .LBB1_49: -; AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r11b -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_51 -; AVX2-NEXT: # %bb.50: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB1_51: -; AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, %r10d -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r10b -; AVX2-NEXT: jno .LBB1_53 -; AVX2-NEXT: # %bb.52: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r10d -; AVX2-NEXT: .LBB1_53: -; AVX2-NEXT: vpextrb $6, %xmm1, %ecx -; AVX2-NEXT: vpextrb $6, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jno .LBB1_55 -; AVX2-NEXT: # %bb.54: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB1_55: -; AVX2-NEXT: vpextrb $5, %xmm1, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, %edi -; AVX2-NEXT: movl %edi, 
%eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: jno .LBB1_57 -; AVX2-NEXT: # %bb.56: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB1_57: -; AVX2-NEXT: vpextrb $4, %xmm1, %edx -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addb %dl, %al -; AVX2-NEXT: jno .LBB1_59 -; AVX2-NEXT: # %bb.58: -; AVX2-NEXT: addb $127, %cl -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB1_59: -; AVX2-NEXT: vpextrb $3, %xmm1, %ebx -; AVX2-NEXT: vpextrb $3, %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: addb %bl, %dl -; AVX2-NEXT: setns %dl -; AVX2-NEXT: addb %bl, %cl -; AVX2-NEXT: jno .LBB1_61 -; AVX2-NEXT: # %bb.60: -; AVX2-NEXT: addb $127, %dl -; AVX2-NEXT: movl %edx, %ecx -; AVX2-NEXT: .LBB1_61: -; AVX2-NEXT: vpextrb $2, %xmm1, %esi -; AVX2-NEXT: vpextrb $2, %xmm0, %edx -; AVX2-NEXT: movl %edx, %ebx -; AVX2-NEXT: addb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: addb %sil, %dl -; AVX2-NEXT: jno .LBB1_63 -; AVX2-NEXT: # %bb.62: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %edx -; AVX2-NEXT: .LBB1_63: -; AVX2-NEXT: vpextrb $0, %xmm1, %esi -; AVX2-NEXT: vpextrb $0, %xmm0, %r8d -; AVX2-NEXT: movl %r8d, %ebx -; AVX2-NEXT: addb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: addb %sil, %r8b -; AVX2-NEXT: jo .LBB1_64 -; AVX2-NEXT: # %bb.65: -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX2-NEXT: jmp .LBB1_66 -; AVX2-NEXT: .LBB1_64: -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r8d -; AVX2-NEXT: .LBB1_66: -; AVX2-NEXT: vpextrb $1, %xmm1, %esi -; AVX2-NEXT: vpextrb $1, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %ebx -; AVX2-NEXT: addb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: addb %sil, %r9b -; AVX2-NEXT: jno .LBB1_68 -; AVX2-NEXT: # %bb.67: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r9d -; AVX2-NEXT: .LBB1_68: -; AVX2-NEXT: movzbl %r8b, %esi -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: movzbl %r9b, %esi -; AVX2-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %dil, %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %bpl, %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r10b, %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r11b, %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r13b, %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r12b, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r15b, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl %r14b, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512-NEXT: vpextrb $15, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: jo .LBB1_1 -; AVX512-NEXT: # %bb.2: -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB1_3 -; AVX512-NEXT: .LBB1_1: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB1_3: -; AVX512-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512-NEXT: vpextrb $14, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: jno .LBB1_5 -; AVX512-NEXT: # %bb.4: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB1_5: -; AVX512-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512-NEXT: vpextrb $13, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: jo .LBB1_6 -; AVX512-NEXT: # %bb.7: -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB1_8 -; AVX512-NEXT: .LBB1_6: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; 
AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB1_8: -; AVX512-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512-NEXT: vpextrb $12, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: jno .LBB1_10 -; AVX512-NEXT: # %bb.9: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB1_10: -; AVX512-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512-NEXT: vpextrb $11, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: jno .LBB1_12 -; AVX512-NEXT: # %bb.11: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB1_12: -; AVX512-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512-NEXT: vpextrb $10, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: jno .LBB1_14 -; AVX512-NEXT: # %bb.13: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB1_14: -; AVX512-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512-NEXT: vpextrb $9, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bl -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_16 -; AVX512-NEXT: # %bb.15: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB1_16: -; AVX512-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512-NEXT: vpextrb $8, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_18 -; AVX512-NEXT: # %bb.17: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB1_18: -; AVX512-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512-NEXT: vpextrb $7, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_20 -; AVX512-NEXT: # %bb.19: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB1_20: -; AVX512-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512-NEXT: vpextrb $6, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: jno .LBB1_22 -; AVX512-NEXT: # %bb.21: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB1_22: -; AVX512-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512-NEXT: vpextrb $5, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: jno .LBB1_24 -; AVX512-NEXT: # %bb.23: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB1_24: -; AVX512-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512-NEXT: vpextrb $4, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r11b -; AVX512-NEXT: jno .LBB1_26 -; AVX512-NEXT: # %bb.25: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB1_26: -; AVX512-NEXT: vpextrb $3, %xmm1, 
%ecx -; AVX512-NEXT: vpextrb $3, %xmm0, %r14d -; AVX512-NEXT: movl %r14d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r14b -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_28 -; AVX512-NEXT: # %bb.27: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: .LBB1_28: -; AVX512-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512-NEXT: vpextrb $2, %xmm0, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r8b -; AVX512-NEXT: jno .LBB1_30 -; AVX512-NEXT: # %bb.29: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r8d -; AVX512-NEXT: .LBB1_30: -; AVX512-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512-NEXT: vpextrb $0, %xmm0, %r10d -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r10b -; AVX512-NEXT: jno .LBB1_32 -; AVX512-NEXT: # %bb.31: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r10d -; AVX512-NEXT: .LBB1_32: -; AVX512-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512-NEXT: vpextrb $1, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r9b -; AVX512-NEXT: jno .LBB1_34 -; AVX512-NEXT: # %bb.33: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r9d -; AVX512-NEXT: .LBB1_34: -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpextrb $15, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: movl %esi, %r12d -; AVX512-NEXT: addb %cl, %bl -; AVX512-NEXT: jno .LBB1_36 -; AVX512-NEXT: # %bb.35: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB1_36: -; AVX512-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512-NEXT: vpextrb $14, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: movl %edx, %r13d -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: jno .LBB1_38 -; AVX512-NEXT: # %bb.37: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB1_38: -; AVX512-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512-NEXT: vpextrb $13, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: movl %edi, %ebp -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: jno .LBB1_40 -; AVX512-NEXT: # %bb.39: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB1_40: -; AVX512-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512-NEXT: vpextrb $12, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jo .LBB1_41 -; AVX512-NEXT: # %bb.42: -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB1_43 -; AVX512-NEXT: .LBB1_41: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB1_43: -; AVX512-NEXT: movl %ebp, %edi -; AVX512-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512-NEXT: vpextrb $11, %xmm0, %r15d -; AVX512-NEXT: movl %r15d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, 
%r15b -; AVX512-NEXT: movl %r12d, %esi -; AVX512-NEXT: movl %r13d, %edx -; AVX512-NEXT: jno .LBB1_45 -; AVX512-NEXT: # %bb.44: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: .LBB1_45: -; AVX512-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512-NEXT: vpextrb $10, %xmm0, %r12d -; AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r12b -; AVX512-NEXT: jno .LBB1_47 -; AVX512-NEXT: # %bb.46: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r12d -; AVX512-NEXT: .LBB1_47: -; AVX512-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512-NEXT: vpextrb $9, %xmm0, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r13b -; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_49 -; AVX512-NEXT: # %bb.48: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: .LBB1_49: -; AVX512-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512-NEXT: vpextrb $8, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r11b -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_51 -; AVX512-NEXT: # %bb.50: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB1_51: -; AVX512-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512-NEXT: vpextrb $7, %xmm0, %r10d -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r10b -; AVX512-NEXT: jno .LBB1_53 -; AVX512-NEXT: # %bb.52: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r10d -; AVX512-NEXT: .LBB1_53: -; AVX512-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512-NEXT: vpextrb $6, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: jno .LBB1_55 -; AVX512-NEXT: # %bb.54: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB1_55: -; AVX512-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512-NEXT: vpextrb $5, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: jno .LBB1_57 -; AVX512-NEXT: # %bb.56: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB1_57: -; AVX512-NEXT: vpextrb $4, %xmm1, %edx -; AVX512-NEXT: vpextrb $4, %xmm0, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %dl, %al -; AVX512-NEXT: jno .LBB1_59 -; AVX512-NEXT: # %bb.58: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: .LBB1_59: -; AVX512-NEXT: vpextrb $3, %xmm1, %ebx -; AVX512-NEXT: vpextrb $3, %xmm0, %ecx -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: addb %bl, %dl -; AVX512-NEXT: setns %dl -; AVX512-NEXT: addb %bl, %cl -; AVX512-NEXT: jno .LBB1_61 -; AVX512-NEXT: # %bb.60: -; AVX512-NEXT: addb $127, %dl -; AVX512-NEXT: movl %edx, 
%ecx -; AVX512-NEXT: .LBB1_61: -; AVX512-NEXT: vpextrb $2, %xmm1, %esi -; AVX512-NEXT: vpextrb $2, %xmm0, %edx -; AVX512-NEXT: movl %edx, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %sil, %dl -; AVX512-NEXT: jno .LBB1_63 -; AVX512-NEXT: # %bb.62: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %edx -; AVX512-NEXT: .LBB1_63: -; AVX512-NEXT: vpextrb $0, %xmm1, %esi -; AVX512-NEXT: vpextrb $0, %xmm0, %r8d -; AVX512-NEXT: movl %r8d, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %sil, %r8b -; AVX512-NEXT: jo .LBB1_64 -; AVX512-NEXT: # %bb.65: -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX512-NEXT: jmp .LBB1_66 -; AVX512-NEXT: .LBB1_64: -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r8d -; AVX512-NEXT: .LBB1_66: -; AVX512-NEXT: vpextrb $1, %xmm1, %esi -; AVX512-NEXT: vpextrb $1, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %sil, %r9b -; AVX512-NEXT: jno .LBB1_68 -; AVX512-NEXT: # %bb.67: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r9d -; AVX512-NEXT: .LBB1_68: -; AVX512-NEXT: movzbl %r8b, %esi -; AVX512-NEXT: vmovd %esi, %xmm0 -; AVX512-NEXT: movzbl %r9b, %esi -; AVX512-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %dl, %edx -; AVX512-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %cl, %ecx -; AVX512-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %al, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %dil, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %bpl, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r10b, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r11b, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r13b, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r12b, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r15b, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb 
$6, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl %r14b, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %z = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y) ret <32 x i8> %z } define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { -; SSE2-LABEL: v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: subq $232, %rsp -; SSE2-NEXT: movaps %xmm5, (%rsp) -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb (%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB2_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_2: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %sil -; SSE2-NEXT: jno .LBB2_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_4: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dil -; SSE2-NEXT: jno .LBB2_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB2_6: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r11b -; SSE2-NEXT: jno .LBB2_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB2_8: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r9b -; SSE2-NEXT: jno .LBB2_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_10: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl 
%r10d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r10b -; SSE2-NEXT: jno .LBB2_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB2_12: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_14: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jo .LBB2_15 -; SSE2-NEXT: # %bb.16: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jmp .LBB2_17 -; SSE2-NEXT: .LBB2_15: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_17: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB2_19 -; SSE2-NEXT: # %bb.18: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_19: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r12b -; SSE2-NEXT: jno .LBB2_21 -; SSE2-NEXT: # %bb.20: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB2_21: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_23 -; SSE2-NEXT: # %bb.22: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_23: -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r9b -; SSE2-NEXT: jno .LBB2_25 -; SSE2-NEXT: # %bb.24: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_25: -; SSE2-NEXT: movl %edi, %r8d -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: jo .LBB2_26 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movl %esi, %edi -; SSE2-NEXT: jmp .LBB2_28 -; SSE2-NEXT: .LBB2_26: -; SSE2-NEXT: movl %esi, %edi -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_28: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %sil -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: 
.LBB2_30: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB2_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_32: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_34 -; SSE2-NEXT: # %bb.33: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_34: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB2_36 -; SSE2-NEXT: # %bb.35: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_36: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r9b -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_38 -; SSE2-NEXT: # %bb.37: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_38: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_40 -; SSE2-NEXT: # %bb.39: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_40: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bpl -; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_42 -; SSE2-NEXT: # %bb.41: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB2_42: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %sil -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_44 -; SSE2-NEXT: # %bb.43: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_44: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB2_46 -; SSE2-NEXT: # %bb.45: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_46: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; 
SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dil -; SSE2-NEXT: jno .LBB2_48 -; SSE2-NEXT: # %bb.47: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB2_48: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_50 -; SSE2-NEXT: # %bb.49: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_50: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %sil -; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_52 -; SSE2-NEXT: # %bb.51: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_52: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dil -; SSE2-NEXT: jno .LBB2_54 -; SSE2-NEXT: # %bb.53: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB2_54: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r14b -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_56 -; SSE2-NEXT: # %bb.55: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB2_56: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r9b -; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_58 -; SSE2-NEXT: # %bb.57: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_58: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_60 -; SSE2-NEXT: # %bb.59: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_60: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB2_62 -; SSE2-NEXT: # %bb.61: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_62: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r15b -; SSE2-NEXT: jno .LBB2_64 -; SSE2-NEXT: # %bb.63: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB2_64: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, 
%bl -; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jo .LBB2_65 -; SSE2-NEXT: # %bb.66: -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jmp .LBB2_67 -; SSE2-NEXT: .LBB2_65: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_67: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r11b -; SSE2-NEXT: jno .LBB2_69 -; SSE2-NEXT: # %bb.68: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB2_69: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_71 -; SSE2-NEXT: # %bb.70: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_71: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r12b -; SSE2-NEXT: jno .LBB2_73 -; SSE2-NEXT: # %bb.72: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB2_73: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r14b -; SSE2-NEXT: jno .LBB2_75 -; SSE2-NEXT: # %bb.74: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB2_75: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r15b -; SSE2-NEXT: jno .LBB2_77 -; SSE2-NEXT: # %bb.76: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB2_77: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bpl -; SSE2-NEXT: jno .LBB2_79 -; SSE2-NEXT: # %bb.78: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB2_79: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r13b -; SSE2-NEXT: jno .LBB2_81 -; SSE2-NEXT: # %bb.80: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB2_81: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_83 -; SSE2-NEXT: # %bb.82: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_83: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb 
-{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jo .LBB2_84 -; SSE2-NEXT: # %bb.85: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jmp .LBB2_86 -; SSE2-NEXT: .LBB2_84: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_86: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_88 -; SSE2-NEXT: # %bb.87: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_88: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jo .LBB2_89 -; SSE2-NEXT: # %bb.90: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jmp .LBB2_91 -; SSE2-NEXT: .LBB2_89: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_91: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_93 -; SSE2-NEXT: # %bb.92: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_93: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r8b -; SSE2-NEXT: jno .LBB2_95 -; SSE2-NEXT: # %bb.94: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB2_95: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_97 -; SSE2-NEXT: # %bb.96: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_97: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r9b -; SSE2-NEXT: jno .LBB2_99 -; SSE2-NEXT: # %bb.98: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_99: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_101 -; SSE2-NEXT: # %bb.100: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_101: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: 
movl %r12d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r12b -; SSE2-NEXT: jno .LBB2_103 -; SSE2-NEXT: # %bb.102: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB2_103: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r10b -; SSE2-NEXT: jno .LBB2_105 -; SSE2-NEXT: # %bb.104: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB2_105: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB2_107 -; SSE2-NEXT: # %bb.106: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_107: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_109 -; SSE2-NEXT: # %bb.108: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_109: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r14b -; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_111 -; SSE2-NEXT: # %bb.110: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB2_111: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r15b -; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_113 -; SSE2-NEXT: # %bb.112: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB2_113: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r12b -; SSE2-NEXT: movl %r8d, %edx -; SSE2-NEXT: jno .LBB2_115 -; SSE2-NEXT: # %bb.114: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB2_115: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r13b -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_117 -; SSE2-NEXT: # %bb.116: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB2_117: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %sil -; SSE2-NEXT: jno .LBB2_119 -; 
SSE2-NEXT: # %bb.118: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_119: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dil -; SSE2-NEXT: jno .LBB2_121 -; SSE2-NEXT: # %bb.120: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB2_121: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r8b -; SSE2-NEXT: jno .LBB2_123 -; SSE2-NEXT: # %bb.122: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB2_123: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r10b -; SSE2-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Reload -; SSE2-NEXT: jno .LBB2_125 -; SSE2-NEXT: # %bb.124: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB2_125: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r11b -; SSE2-NEXT: jno .LBB2_127 -; SSE2-NEXT: # %bb.126: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB2_127: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: addb %bl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %bl, %cl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_129 -; SSE2-NEXT: # %bb.128: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_129: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %dl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %dl, %bl -; SSE2-NEXT: jno .LBB2_131 -; SSE2-NEXT: # %bb.130: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_131: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: addb %dl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %dl, %r9b -; SSE2-NEXT: jno .LBB2_133 -; SSE2-NEXT: # %bb.132: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_133: -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %dl # 1-byte Reload -; SSE2-NEXT: movzbl %r9b, %ebp -; SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %bl, %ebp -; SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r11b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r10b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r8b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %dil, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %sil, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r13b, %ecx -; SSE2-NEXT: 
movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r12b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r15b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r14b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %dl, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSE2-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload -; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3],xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload -; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm8 # 16-byte Folded Reload -; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE2-NEXT: movd %r13d, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; SSE2-NEXT: movd %r12d, %xmm1 -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE2-NEXT: movd %r14d, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: movd %ebp, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE2-NEXT: movd %ebx, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; SSE2-NEXT: movd %r11d, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE2-NEXT: movd %eax, %xmm11 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSE2-NEXT: movd %ecx, %xmm6 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: movd %edx, %xmm13 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE2-NEXT: movd %edi, %xmm5 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE2-NEXT: movd %esi, %xmm15 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE2-NEXT: movd %r10d, %xmm10 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE2-NEXT: movd %r15d, %xmm2 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE2-NEXT: movd %r12d, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE2-NEXT: movd %r9d, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE2-NEXT: movd %r11d, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE2-NEXT: movd %r14d, %xmm12 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; SSE2-NEXT: movd %edi, %xmm7 -; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; SSE2-NEXT: movd %r13d, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSE2-NEXT: movd %esi, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE2-NEXT: movd %edx, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: movd %r8d, %xmm15 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE2-NEXT: movd %ebp, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movd %ecx, %xmm13 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: movd %r10d, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: movd %ebx, %xmm10 -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE2-NEXT: movd %r15d, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = 
xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE2-NEXT: movd %r12d, %xmm3 -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: addq $232, %rsp -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v64i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: subq $232, %rsp -; SSSE3-NEXT: movaps %xmm5, (%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb (%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB2_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_2: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: jno .LBB2_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_4: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dil -; SSSE3-NEXT: jno .LBB2_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_6: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r11b -; SSSE3-NEXT: jno .LBB2_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB2_8: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: jno .LBB2_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_10: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r10b -; SSSE3-NEXT: jno .LBB2_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB2_12: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_14: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jo .LBB2_15 -; SSSE3-NEXT: # %bb.16: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB2_17 -; SSSE3-NEXT: .LBB2_15: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_17: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB2_19 -; SSSE3-NEXT: # %bb.18: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_19: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r12b -; SSSE3-NEXT: jno .LBB2_21 -; SSSE3-NEXT: # %bb.20: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_21: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_23 -; SSSE3-NEXT: # %bb.22: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_23: -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: jno .LBB2_25 -; SSSE3-NEXT: # %bb.24: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_25: -; SSSE3-NEXT: movl %edi, %r8d -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; 
SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: jo .LBB2_26 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movl %esi, %edi -; SSSE3-NEXT: jmp .LBB2_28 -; SSSE3-NEXT: .LBB2_26: -; SSSE3-NEXT: movl %esi, %edi -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_28: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_30: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB2_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_32: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_34 -; SSSE3-NEXT: # %bb.33: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_34: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB2_36 -; SSSE3-NEXT: # %bb.35: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_36: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_38 -; SSSE3-NEXT: # %bb.37: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_38: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_40 -; SSSE3-NEXT: # %bb.39: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_40: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bpl -; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_42 -; 
SSSE3-NEXT: # %bb.41: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB2_42: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_44 -; SSSE3-NEXT: # %bb.43: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_44: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB2_46 -; SSSE3-NEXT: # %bb.45: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_46: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dil -; SSSE3-NEXT: jno .LBB2_48 -; SSSE3-NEXT: # %bb.47: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_48: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_50 -; SSSE3-NEXT: # %bb.49: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_50: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_52 -; SSSE3-NEXT: # %bb.51: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_52: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dil -; SSSE3-NEXT: jno .LBB2_54 -; SSSE3-NEXT: # %bb.53: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_54: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r14b -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_56 -; SSSE3-NEXT: # %bb.55: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB2_56: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_58 -; SSSE3-NEXT: # %bb.57: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_58: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dl, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_60 -; SSSE3-NEXT: # %bb.59: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_60: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB2_62 -; SSSE3-NEXT: # %bb.61: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_62: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r15b -; SSSE3-NEXT: jno .LBB2_64 -; SSSE3-NEXT: # %bb.63: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB2_64: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jo .LBB2_65 -; SSSE3-NEXT: # %bb.66: -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB2_67 -; SSSE3-NEXT: .LBB2_65: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_67: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r11b -; SSSE3-NEXT: jno .LBB2_69 -; SSSE3-NEXT: # %bb.68: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB2_69: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_71 -; SSSE3-NEXT: # %bb.70: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_71: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r12b -; SSSE3-NEXT: jno .LBB2_73 -; SSSE3-NEXT: # %bb.72: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_73: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r14b -; SSSE3-NEXT: jno .LBB2_75 -; SSSE3-NEXT: # %bb.74: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB2_75: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r15b -; SSSE3-NEXT: jno .LBB2_77 -; SSSE3-NEXT: # %bb.76: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB2_77: 
-; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bpl -; SSSE3-NEXT: jno .LBB2_79 -; SSSE3-NEXT: # %bb.78: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB2_79: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r13b -; SSSE3-NEXT: jno .LBB2_81 -; SSSE3-NEXT: # %bb.80: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB2_81: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_83 -; SSSE3-NEXT: # %bb.82: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_83: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jo .LBB2_84 -; SSSE3-NEXT: # %bb.85: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB2_86 -; SSSE3-NEXT: .LBB2_84: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_86: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_88 -; SSSE3-NEXT: # %bb.87: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_88: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jo .LBB2_89 -; SSSE3-NEXT: # %bb.90: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB2_91 -; SSSE3-NEXT: .LBB2_89: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_91: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_93 -; SSSE3-NEXT: # %bb.92: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_93: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r8b -; SSSE3-NEXT: jno .LBB2_95 -; SSSE3-NEXT: # %bb.94: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB2_95: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dl, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_97 -; SSSE3-NEXT: # %bb.96: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_97: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: jno .LBB2_99 -; SSSE3-NEXT: # %bb.98: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_99: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_101 -; SSSE3-NEXT: # %bb.100: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_101: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r12b -; SSSE3-NEXT: jno .LBB2_103 -; SSSE3-NEXT: # %bb.102: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_103: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r10b -; SSSE3-NEXT: jno .LBB2_105 -; SSSE3-NEXT: # %bb.104: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB2_105: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB2_107 -; SSSE3-NEXT: # %bb.106: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_107: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_109 -; SSSE3-NEXT: # %bb.108: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_109: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r14b -; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_111 -; SSSE3-NEXT: # %bb.110: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB2_111: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r15b -; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r10b, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_113 -; SSSE3-NEXT: # %bb.112: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB2_113: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r12b -; SSSE3-NEXT: movl %r8d, %edx -; SSSE3-NEXT: jno .LBB2_115 -; SSSE3-NEXT: # %bb.114: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_115: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r13b -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_117 -; SSSE3-NEXT: # %bb.116: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB2_117: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: jno .LBB2_119 -; SSSE3-NEXT: # %bb.118: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_119: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dil -; SSSE3-NEXT: jno .LBB2_121 -; SSSE3-NEXT: # %bb.120: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_121: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r8b -; SSSE3-NEXT: jno .LBB2_123 -; SSSE3-NEXT: # %bb.122: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB2_123: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r10b -; SSSE3-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Reload -; SSSE3-NEXT: jno .LBB2_125 -; SSSE3-NEXT: # %bb.124: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB2_125: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r11b -; SSSE3-NEXT: jno .LBB2_127 -; SSSE3-NEXT: # %bb.126: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB2_127: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: addb %bl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %bl, %cl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_129 -; SSSE3-NEXT: # %bb.128: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_129: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, 
%eax -; SSSE3-NEXT: addb %dl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %dl, %bl -; SSSE3-NEXT: jno .LBB2_131 -; SSSE3-NEXT: # %bb.130: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_131: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %dl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %dl, %r9b -; SSSE3-NEXT: jno .LBB2_133 -; SSSE3-NEXT: # %bb.132: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_133: -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %dl # 1-byte Reload -; SSSE3-NEXT: movzbl %r9b, %ebp -; SSSE3-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %bl, %ebp -; SSSE3-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r11b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r10b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r8b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %dil, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %sil, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r13b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r12b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r15b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r14b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %dl, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm5 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm11 = 
mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero -; 
[a long run of deleted SSSE3 check lines elided: continuing the block above, the per-byte saturation results held in GPRs and stack spills are moved into XMM registers (movd/movzbl reloads) and reassembled into the four result vectors with punpcklbw/punpcklwd/punpckldq/punpcklqdq]
-; SSSE3-NEXT: movdqa %xmm9, %xmm0
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: addq $232, %rsp
-; SSSE3-NEXT: popq %rbx
-; SSSE3-NEXT: popq %r12
-; SSSE3-NEXT: popq %r13
-; SSSE3-NEXT: popq %r14
-; SSSE3-NEXT: popq %r15
-; SSSE3-NEXT: popq %rbp
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: v64i8:
-; SSE41: # %bb.0:
[several hundred deleted SSE41 check lines elided: each of the 64 byte lanes is extracted with pextrb, saturated by an addb/setns/jno/addb $127 sequence (spilling and reloading intermediates on the stack), and the saturated bytes are reinserted into the result vectors with movzbl/pinsrb]
-; SSE41-NEXT: addq $76, %rsp
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v64i8:
+; SSE: # %bb.0:
+; SSE-NEXT: paddsb %xmm4, %xmm0
+; SSE-NEXT: paddsb %xmm5, %xmm1
+; SSE-NEXT: paddsb %xmm6, %xmm2
+; SSE-NEXT: paddsb %xmm7, %xmm3
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: v64i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: subq $76, %rsp
[deleted AVX1 check lines elided: the same per-lane vpextrb/addb/setns/jno/addb $127 expansion as SSE41, applied to both 128-bit halves via vextractf128; the deleted block continues beyond this excerpt]
%edx, %ebx -; AVX1-NEXT: addb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addb %sil, %dl -; AVX1-NEXT: jno .LBB2_125 -; AVX1-NEXT: # %bb.124: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %edx -; AVX1-NEXT: .LBB2_125: -; AVX1-NEXT: vpextrb $3, %xmm1, %esi -; AVX1-NEXT: vpextrb $3, %xmm0, %r8d -; AVX1-NEXT: movl %r8d, %ebx -; AVX1-NEXT: addb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addb %sil, %r8b -; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_127 -; AVX1-NEXT: # %bb.126: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r8d -; AVX1-NEXT: .LBB2_127: -; AVX1-NEXT: vpextrb $2, %xmm1, %esi -; AVX1-NEXT: vpextrb $2, %xmm0, %r9d -; AVX1-NEXT: movl %r9d, %ebx -; AVX1-NEXT: addb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addb %sil, %r9b -; AVX1-NEXT: jno .LBB2_129 -; AVX1-NEXT: # %bb.128: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r9d -; AVX1-NEXT: .LBB2_129: -; AVX1-NEXT: vpextrb $0, %xmm1, %esi -; AVX1-NEXT: vpextrb $0, %xmm0, %r10d -; AVX1-NEXT: movl %r10d, %ebx -; AVX1-NEXT: addb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: movl %r11d, %r12d -; AVX1-NEXT: addb %sil, %r10b -; AVX1-NEXT: jno .LBB2_131 -; AVX1-NEXT: # %bb.130: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r10d -; AVX1-NEXT: .LBB2_131: -; AVX1-NEXT: vpextrb $1, %xmm1, %esi -; AVX1-NEXT: vpextrb $1, %xmm0, %r11d -; AVX1-NEXT: movl %r11d, %ebx -; AVX1-NEXT: addb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addb %sil, %r11b -; AVX1-NEXT: jno .LBB2_133 -; AVX1-NEXT: # %bb.132: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r11d -; AVX1-NEXT: .LBB2_133: -; AVX1-NEXT: movzbl %r10b, %esi -; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: movzbl %r11b, %esi -; AVX1-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r9b, %esi -; AVX1-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r8b, %esi -; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %dl, %edx -; AVX1-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %dil, %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %bpl, %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r14b, %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r15b, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r13b, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r12b, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: 
vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 
-; AVX1-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT:    addq $76, %rsp
-; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    popq %r12
-; AVX1-NEXT:    popq %r13
-; AVX1-NEXT:    popq %r14
-; AVX1-NEXT:    popq %r15
-; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddsb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddsb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddsb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddsb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v64i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    pushq %r12
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    subq $76, %rsp
-; AVX2-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %edx
-; AVX2-NEXT:    movl %edx, %eax
-; AVX2-NEXT:    addb %cl, %al
-; AVX2-NEXT:    setns %al
-; AVX2-NEXT:    addb %cl, %dl
-; AVX2-NEXT:    jo .LBB2_1
-; AVX2-NEXT:  # %bb.2:
-; AVX2-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT:    jmp .LBB2_3
-; AVX2-NEXT:  .LBB2_1:
-; AVX2-NEXT:    addb $127, %al
-; AVX2-NEXT:    # kill: def $al killed $al def $eax
-; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT:  .LBB2_3:
-; AVX2-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
-; AVX2-NEXT:    movl %edx, %eax
-; AVX2-NEXT:    addb %cl, %al
-; AVX2-NEXT:    setns %al
-; AVX2-NEXT:    addb %cl, %dl
-; AVX2-NEXT:    jno .LBB2_5
-; AVX2-NEXT:  # %bb.4:
-; AVX2-NEXT:    addb $127, %al
-; AVX2-NEXT:    movl %eax, %edx
-; AVX2-NEXT:  .LBB2_5:
-; AVX2-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
-; AVX2-NEXT:    movl %esi, %eax
-; AVX2-NEXT:    addb %cl, %al
-; AVX2-NEXT:
setns %al -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: jo .LBB2_6 -; AVX2-NEXT: # %bb.7: -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB2_8 -; AVX2-NEXT: .LBB2_6: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_8: -; AVX2-NEXT: vpextrb $12, %xmm3, %ecx -; AVX2-NEXT: vpextrb $12, %xmm1, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: jno .LBB2_10 -; AVX2-NEXT: # %bb.9: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB2_10: -; AVX2-NEXT: vpextrb $11, %xmm3, %ecx -; AVX2-NEXT: vpextrb $11, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: jno .LBB2_12 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_12: -; AVX2-NEXT: vpextrb $10, %xmm3, %ecx -; AVX2-NEXT: vpextrb $10, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jno .LBB2_14 -; AVX2-NEXT: # %bb.13: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_14: -; AVX2-NEXT: vpextrb $9, %xmm3, %ecx -; AVX2-NEXT: vpextrb $9, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bl -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jo .LBB2_15 -; AVX2-NEXT: # %bb.16: -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB2_17 -; AVX2-NEXT: .LBB2_15: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_17: -; AVX2-NEXT: vpextrb $8, %xmm3, %ecx -; AVX2-NEXT: vpextrb $8, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jno .LBB2_19 -; AVX2-NEXT: # %bb.18: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_19: -; AVX2-NEXT: vpextrb $7, %xmm3, %ecx -; AVX2-NEXT: vpextrb $7, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bl -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_21 -; AVX2-NEXT: # %bb.20: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_21: -; AVX2-NEXT: vpextrb $6, %xmm3, %ecx -; AVX2-NEXT: vpextrb $6, %xmm1, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_23 -; AVX2-NEXT: # %bb.22: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB2_23: -; AVX2-NEXT: vpextrb $5, %xmm3, %ecx -; AVX2-NEXT: vpextrb $5, %xmm1, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r11b -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_25 -; AVX2-NEXT: # %bb.24: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB2_25: -; AVX2-NEXT: vpextrb 
$4, %xmm3, %ecx -; AVX2-NEXT: vpextrb $4, %xmm1, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r13b -; AVX2-NEXT: jno .LBB2_27 -; AVX2-NEXT: # %bb.26: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: .LBB2_27: -; AVX2-NEXT: vpextrb $3, %xmm3, %ecx -; AVX2-NEXT: vpextrb $3, %xmm1, %r8d -; AVX2-NEXT: movl %r8d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r8b -; AVX2-NEXT: jno .LBB2_29 -; AVX2-NEXT: # %bb.28: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r8d -; AVX2-NEXT: .LBB2_29: -; AVX2-NEXT: vpextrb $2, %xmm3, %ecx -; AVX2-NEXT: vpextrb $2, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jno .LBB2_31 -; AVX2-NEXT: # %bb.30: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_31: -; AVX2-NEXT: vpextrb $0, %xmm3, %ecx -; AVX2-NEXT: vpextrb $0, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bl -; AVX2-NEXT: jno .LBB2_33 -; AVX2-NEXT: # %bb.32: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_33: -; AVX2-NEXT: vpextrb $1, %xmm3, %ecx -; AVX2-NEXT: vpextrb $1, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: jno .LBB2_35 -; AVX2-NEXT: # %bb.34: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_35: -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vpextrb $15, %xmm3, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_37 -; AVX2-NEXT: # %bb.36: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_37: -; AVX2-NEXT: vpextrb $14, %xmm3, %ecx -; AVX2-NEXT: vpextrb $14, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_39 -; AVX2-NEXT: # %bb.38: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_39: -; AVX2-NEXT: vpextrb $13, %xmm3, %ecx -; AVX2-NEXT: vpextrb $13, %xmm1, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r12b -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_41 -; AVX2-NEXT: # %bb.40: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r12d -; AVX2-NEXT: .LBB2_41: -; AVX2-NEXT: vpextrb $12, %xmm3, %ecx -; AVX2-NEXT: vpextrb $12, %xmm1, %r15d -; AVX2-NEXT: movl %r15d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r15b -; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_43 -; AVX2-NEXT: # %bb.42: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: .LBB2_43: -; AVX2-NEXT: vpextrb $11, %xmm3, %ecx -; AVX2-NEXT: vpextrb $11, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl 
-; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_45 -; AVX2-NEXT: # %bb.44: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_45: -; AVX2-NEXT: vpextrb $10, %xmm3, %ecx -; AVX2-NEXT: vpextrb $10, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bl -; AVX2-NEXT: jno .LBB2_47 -; AVX2-NEXT: # %bb.46: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_47: -; AVX2-NEXT: vpextrb $9, %xmm3, %ecx -; AVX2-NEXT: vpextrb $9, %xmm1, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: jno .LBB2_49 -; AVX2-NEXT: # %bb.48: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_49: -; AVX2-NEXT: vpextrb $8, %xmm3, %ecx -; AVX2-NEXT: vpextrb $8, %xmm1, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, (%rsp) # 4-byte Spill -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jo .LBB2_50 -; AVX2-NEXT: # %bb.51: -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB2_52 -; AVX2-NEXT: .LBB2_50: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_52: -; AVX2-NEXT: vpextrb $7, %xmm3, %ecx -; AVX2-NEXT: vpextrb $7, %xmm1, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r11b -; AVX2-NEXT: jno .LBB2_54 -; AVX2-NEXT: # %bb.53: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB2_54: -; AVX2-NEXT: vpextrb $6, %xmm3, %ecx -; AVX2-NEXT: vpextrb $6, %xmm1, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_56 -; AVX2-NEXT: # %bb.55: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB2_56: -; AVX2-NEXT: vpextrb $5, %xmm3, %ecx -; AVX2-NEXT: vpextrb $5, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: jno .LBB2_58 -; AVX2-NEXT: # %bb.57: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_58: -; AVX2-NEXT: vpextrb $4, %xmm3, %ecx -; AVX2-NEXT: vpextrb $4, %xmm1, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r13b -; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_60 -; AVX2-NEXT: # %bb.59: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_60: -; AVX2-NEXT: vpextrb $3, %xmm3, %ecx -; AVX2-NEXT: vpextrb $3, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jo .LBB2_61 -; AVX2-NEXT: # %bb.62: -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB2_63 -; AVX2-NEXT: .LBB2_61: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; 
AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_63: -; AVX2-NEXT: vpextrb $2, %xmm3, %ecx -; AVX2-NEXT: vpextrb $2, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_65 -; AVX2-NEXT: # %bb.64: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_65: -; AVX2-NEXT: vpextrb $0, %xmm3, %ecx -; AVX2-NEXT: vpextrb $0, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_67 -; AVX2-NEXT: # %bb.66: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_67: -; AVX2-NEXT: vpextrb $1, %xmm3, %ecx -; AVX2-NEXT: vpextrb $1, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bl -; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_69 -; AVX2-NEXT: # %bb.68: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_69: -; AVX2-NEXT: vpextrb $15, %xmm2, %ecx -; AVX2-NEXT: vpextrb $15, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_71 -; AVX2-NEXT: # %bb.70: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_71: -; AVX2-NEXT: vpextrb $14, %xmm2, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: jno .LBB2_73 -; AVX2-NEXT: # %bb.72: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_73: -; AVX2-NEXT: vpextrb $13, %xmm2, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, %r10d -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r10b -; AVX2-NEXT: jno .LBB2_75 -; AVX2-NEXT: # %bb.74: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r10d -; AVX2-NEXT: .LBB2_75: -; AVX2-NEXT: vpextrb $12, %xmm2, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r12b -; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_77 -; AVX2-NEXT: # %bb.76: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_77: -; AVX2-NEXT: vpextrb $11, %xmm2, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, %r14d -; AVX2-NEXT: movl %r14d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r14b -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_79 -; AVX2-NEXT: # %bb.78: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: .LBB2_79: -; AVX2-NEXT: vpextrb $10, %xmm2, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r13b -; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: 
movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_81 -; AVX2-NEXT: # %bb.80: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: .LBB2_81: -; AVX2-NEXT: vpextrb $9, %xmm2, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, %r8d -; AVX2-NEXT: movl %r8d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r8b -; AVX2-NEXT: jno .LBB2_83 -; AVX2-NEXT: # %bb.82: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r8d -; AVX2-NEXT: .LBB2_83: -; AVX2-NEXT: vpextrb $8, %xmm2, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, %r15d -; AVX2-NEXT: movl %r15d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r15b -; AVX2-NEXT: jno .LBB2_85 -; AVX2-NEXT: # %bb.84: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: .LBB2_85: -; AVX2-NEXT: vpextrb $7, %xmm2, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r12b -; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_87 -; AVX2-NEXT: # %bb.86: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_87: -; AVX2-NEXT: vpextrb $6, %xmm2, %ecx -; AVX2-NEXT: vpextrb $6, %xmm0, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r12b -; AVX2-NEXT: jno .LBB2_89 -; AVX2-NEXT: # %bb.88: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r12d -; AVX2-NEXT: .LBB2_89: -; AVX2-NEXT: vpextrb $5, %xmm2, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jno .LBB2_91 -; AVX2-NEXT: # %bb.90: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_91: -; AVX2-NEXT: vpextrb $4, %xmm2, %ecx -; AVX2-NEXT: vpextrb $4, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %sil -; AVX2-NEXT: jno .LBB2_93 -; AVX2-NEXT: # %bb.92: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB2_93: -; AVX2-NEXT: vpextrb $3, %xmm2, %ecx -; AVX2-NEXT: vpextrb $3, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_95 -; AVX2-NEXT: # %bb.94: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_95: -; AVX2-NEXT: vpextrb $2, %xmm2, %ecx -; AVX2-NEXT: vpextrb $2, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: jno .LBB2_97 -; AVX2-NEXT: # %bb.96: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_97: -; AVX2-NEXT: vpextrb $0, %xmm2, %ecx -; AVX2-NEXT: vpextrb $0, %xmm0, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bl -; AVX2-NEXT: jno .LBB2_99 -; AVX2-NEXT: # %bb.98: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_99: -; AVX2-NEXT: vpextrb $1, %xmm2, %ecx -; AVX2-NEXT: vpextrb $1, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r11b -; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte 
Spill -; AVX2-NEXT: jno .LBB2_101 -; AVX2-NEXT: # %bb.100: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_101: -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r11b -; AVX2-NEXT: jno .LBB2_103 -; AVX2-NEXT: # %bb.102: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB2_103: -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r9b -; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_105 -; AVX2-NEXT: # %bb.104: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_105: -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r9b -; AVX2-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_107 -; AVX2-NEXT: # %bb.106: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r9d -; AVX2-NEXT: .LBB2_107: -; AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_109 -; AVX2-NEXT: # %bb.108: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_109: -; AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r13b -; AVX2-NEXT: jno .LBB2_111 -; AVX2-NEXT: # %bb.110: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: .LBB2_111: -; AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, %r15d -; AVX2-NEXT: movl %r15d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r15b -; AVX2-NEXT: jno .LBB2_113 -; AVX2-NEXT: # %bb.112: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: .LBB2_113: -; AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, %r14d -; AVX2-NEXT: movl %r14d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r14b -; AVX2-NEXT: jno .LBB2_115 -; AVX2-NEXT: # %bb.114: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: .LBB2_115: -; AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, 
%ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jno .LBB2_117 -; AVX2-NEXT: # %bb.116: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_117: -; AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_119 -; AVX2-NEXT: # %bb.118: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_119: -; AVX2-NEXT: vpextrb $6, %xmm1, %edx -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addb %dl, %al -; AVX2-NEXT: jno .LBB2_121 -; AVX2-NEXT: # %bb.120: -; AVX2-NEXT: addb $127, %cl -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB2_121: -; AVX2-NEXT: vpextrb $5, %xmm1, %ebx -; AVX2-NEXT: vpextrb $5, %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: addb %bl, %dl -; AVX2-NEXT: setns %dl -; AVX2-NEXT: addb %bl, %cl -; AVX2-NEXT: jno .LBB2_123 -; AVX2-NEXT: # %bb.122: -; AVX2-NEXT: addb $127, %dl -; AVX2-NEXT: movl %edx, %ecx -; AVX2-NEXT: .LBB2_123: -; AVX2-NEXT: vpextrb $4, %xmm1, %esi -; AVX2-NEXT: vpextrb $4, %xmm0, %edx -; AVX2-NEXT: movl %edx, %ebx -; AVX2-NEXT: addb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: addb %sil, %dl -; AVX2-NEXT: jno .LBB2_125 -; AVX2-NEXT: # %bb.124: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %edx -; AVX2-NEXT: .LBB2_125: -; AVX2-NEXT: vpextrb $3, %xmm1, %esi -; AVX2-NEXT: vpextrb $3, %xmm0, %r8d -; AVX2-NEXT: movl %r8d, %ebx -; AVX2-NEXT: addb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: addb %sil, %r8b -; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_127 -; AVX2-NEXT: # %bb.126: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r8d -; AVX2-NEXT: .LBB2_127: -; AVX2-NEXT: vpextrb $2, %xmm1, %esi -; AVX2-NEXT: vpextrb $2, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %ebx -; AVX2-NEXT: addb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: addb %sil, %r9b -; AVX2-NEXT: jno .LBB2_129 -; AVX2-NEXT: # %bb.128: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r9d -; AVX2-NEXT: .LBB2_129: -; AVX2-NEXT: vpextrb $0, %xmm1, %esi -; AVX2-NEXT: vpextrb $0, %xmm0, %r10d -; AVX2-NEXT: movl %r10d, %ebx -; AVX2-NEXT: addb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: movl %r11d, %r12d -; AVX2-NEXT: addb %sil, %r10b -; AVX2-NEXT: jno .LBB2_131 -; AVX2-NEXT: # %bb.130: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r10d -; AVX2-NEXT: .LBB2_131: -; AVX2-NEXT: vpextrb $1, %xmm1, %esi -; AVX2-NEXT: vpextrb $1, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %ebx -; AVX2-NEXT: addb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: addb %sil, %r11b -; AVX2-NEXT: jno .LBB2_133 -; AVX2-NEXT: # %bb.132: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r11d -; AVX2-NEXT: .LBB2_133: -; AVX2-NEXT: movzbl %r10b, %esi -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: movzbl %r11b, %esi -; AVX2-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r9b, %esi -; AVX2-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r8b, %esi -; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; 
AVX2-NEXT: movzbl %dil, %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %bpl, %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r14b, %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r15b, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r13b, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r12b, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $6, 
%eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl (%rsp), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vmovd %eax, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm1
-; AVX2-NEXT:    addq $76, %rsp
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vpaddsb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddsb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v64i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $76, %rsp -; AVX512-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512-NEXT: vpextrb $15, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: jo .LBB2_1 -; AVX512-NEXT: # %bb.2: -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB2_3 -; AVX512-NEXT: .LBB2_1: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_3: -; AVX512-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512-NEXT: vpextrb $14, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: jno .LBB2_5 -; AVX512-NEXT: # %bb.4: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_5: -; AVX512-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512-NEXT: vpextrb $13, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: jo .LBB2_6 -; AVX512-NEXT: # %bb.7: -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB2_8 -; AVX512-NEXT: .LBB2_6: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_8: -; AVX512-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512-NEXT: vpextrb $12, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: jno .LBB2_10 -; AVX512-NEXT: # %bb.9: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB2_10: -; AVX512-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512-NEXT: vpextrb $11, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: jno .LBB2_12 -; AVX512-NEXT: # %bb.11: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_12: -; AVX512-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512-NEXT: vpextrb $10, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: jno .LBB2_14 -; AVX512-NEXT: # %bb.13: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_14: -; AVX512-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512-NEXT: vpextrb $9, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bl -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jo .LBB2_15 -; AVX512-NEXT: # %bb.16: -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB2_17 -; AVX512-NEXT: .LBB2_15: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_17: -; AVX512-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512-NEXT: vpextrb $8, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: jno .LBB2_19 -; AVX512-NEXT: # %bb.18: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_19: -; AVX512-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512-NEXT: vpextrb 
$7, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bl -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_21 -; AVX512-NEXT: # %bb.20: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_21: -; AVX512-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512-NEXT: vpextrb $6, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_23 -; AVX512-NEXT: # %bb.22: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB2_23: -; AVX512-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512-NEXT: vpextrb $5, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r11b -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_25 -; AVX512-NEXT: # %bb.24: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB2_25: -; AVX512-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512-NEXT: vpextrb $4, %xmm0, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r13b -; AVX512-NEXT: jno .LBB2_27 -; AVX512-NEXT: # %bb.26: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: .LBB2_27: -; AVX512-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512-NEXT: vpextrb $3, %xmm0, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r8b -; AVX512-NEXT: jno .LBB2_29 -; AVX512-NEXT: # %bb.28: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r8d -; AVX512-NEXT: .LBB2_29: -; AVX512-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512-NEXT: vpextrb $2, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: jno .LBB2_31 -; AVX512-NEXT: # %bb.30: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_31: -; AVX512-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512-NEXT: vpextrb $0, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bl -; AVX512-NEXT: jno .LBB2_33 -; AVX512-NEXT: # %bb.32: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_33: -; AVX512-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512-NEXT: vpextrb $1, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: jno .LBB2_35 -; AVX512-NEXT: # %bb.34: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_35: -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpextrb $15, %xmm2, %ecx -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpextrb $15, %xmm3, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_37 -; 
AVX512-NEXT: # %bb.36: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_37: -; AVX512-NEXT: vpextrb $14, %xmm2, %ecx -; AVX512-NEXT: vpextrb $14, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_39 -; AVX512-NEXT: # %bb.38: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_39: -; AVX512-NEXT: vpextrb $13, %xmm2, %ecx -; AVX512-NEXT: vpextrb $13, %xmm3, %r12d -; AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r12b -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_41 -; AVX512-NEXT: # %bb.40: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r12d -; AVX512-NEXT: .LBB2_41: -; AVX512-NEXT: vpextrb $12, %xmm2, %ecx -; AVX512-NEXT: vpextrb $12, %xmm3, %r15d -; AVX512-NEXT: movl %r15d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r15b -; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_43 -; AVX512-NEXT: # %bb.42: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: .LBB2_43: -; AVX512-NEXT: vpextrb $11, %xmm2, %ecx -; AVX512-NEXT: vpextrb $11, %xmm3, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_45 -; AVX512-NEXT: # %bb.44: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_45: -; AVX512-NEXT: vpextrb $10, %xmm2, %ecx -; AVX512-NEXT: vpextrb $10, %xmm3, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bl -; AVX512-NEXT: jno .LBB2_47 -; AVX512-NEXT: # %bb.46: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_47: -; AVX512-NEXT: vpextrb $9, %xmm2, %ecx -; AVX512-NEXT: vpextrb $9, %xmm3, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: jno .LBB2_49 -; AVX512-NEXT: # %bb.48: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_49: -; AVX512-NEXT: vpextrb $8, %xmm2, %ecx -; AVX512-NEXT: vpextrb $8, %xmm3, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, (%rsp) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jo .LBB2_50 -; AVX512-NEXT: # %bb.51: -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB2_52 -; AVX512-NEXT: .LBB2_50: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_52: -; AVX512-NEXT: vpextrb $7, %xmm2, %ecx -; AVX512-NEXT: vpextrb $7, %xmm3, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r11b -; AVX512-NEXT: jno .LBB2_54 -; AVX512-NEXT: # %bb.53: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB2_54: -; AVX512-NEXT: 
vpextrb $6, %xmm2, %ecx -; AVX512-NEXT: vpextrb $6, %xmm3, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_56 -; AVX512-NEXT: # %bb.55: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB2_56: -; AVX512-NEXT: vpextrb $5, %xmm2, %ecx -; AVX512-NEXT: vpextrb $5, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: jno .LBB2_58 -; AVX512-NEXT: # %bb.57: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_58: -; AVX512-NEXT: vpextrb $4, %xmm2, %ecx -; AVX512-NEXT: vpextrb $4, %xmm3, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r13b -; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_60 -; AVX512-NEXT: # %bb.59: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_60: -; AVX512-NEXT: vpextrb $3, %xmm2, %ecx -; AVX512-NEXT: vpextrb $3, %xmm3, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: jo .LBB2_61 -; AVX512-NEXT: # %bb.62: -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB2_63 -; AVX512-NEXT: .LBB2_61: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_63: -; AVX512-NEXT: vpextrb $2, %xmm2, %ecx -; AVX512-NEXT: vpextrb $2, %xmm3, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_65 -; AVX512-NEXT: # %bb.64: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_65: -; AVX512-NEXT: vpextrb $0, %xmm2, %ecx -; AVX512-NEXT: vpextrb $0, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_67 -; AVX512-NEXT: # %bb.66: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_67: -; AVX512-NEXT: vpextrb $1, %xmm2, %ecx -; AVX512-NEXT: vpextrb $1, %xmm3, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bl -; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_69 -; AVX512-NEXT: # %bb.68: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_69: -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; AVX512-NEXT: vpextrb $15, %xmm2, %ecx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512-NEXT: vpextrb $15, %xmm3, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_71 -; AVX512-NEXT: # %bb.70: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl 
%eax, %edx -; AVX512-NEXT: .LBB2_71: -; AVX512-NEXT: vpextrb $14, %xmm2, %ecx -; AVX512-NEXT: vpextrb $14, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: jno .LBB2_73 -; AVX512-NEXT: # %bb.72: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_73: -; AVX512-NEXT: vpextrb $13, %xmm2, %ecx -; AVX512-NEXT: vpextrb $13, %xmm3, %r10d -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r10b -; AVX512-NEXT: jno .LBB2_75 -; AVX512-NEXT: # %bb.74: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r10d -; AVX512-NEXT: .LBB2_75: -; AVX512-NEXT: vpextrb $12, %xmm2, %ecx -; AVX512-NEXT: vpextrb $12, %xmm3, %r12d -; AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r12b -; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_77 -; AVX512-NEXT: # %bb.76: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_77: -; AVX512-NEXT: vpextrb $11, %xmm2, %ecx -; AVX512-NEXT: vpextrb $11, %xmm3, %r14d -; AVX512-NEXT: movl %r14d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r14b -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_79 -; AVX512-NEXT: # %bb.78: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: .LBB2_79: -; AVX512-NEXT: vpextrb $10, %xmm2, %ecx -; AVX512-NEXT: vpextrb $10, %xmm3, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r13b -; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_81 -; AVX512-NEXT: # %bb.80: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: .LBB2_81: -; AVX512-NEXT: vpextrb $9, %xmm2, %ecx -; AVX512-NEXT: vpextrb $9, %xmm3, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r8b -; AVX512-NEXT: jno .LBB2_83 -; AVX512-NEXT: # %bb.82: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r8d -; AVX512-NEXT: .LBB2_83: -; AVX512-NEXT: vpextrb $8, %xmm2, %ecx -; AVX512-NEXT: vpextrb $8, %xmm3, %r15d -; AVX512-NEXT: movl %r15d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r15b -; AVX512-NEXT: jno .LBB2_85 -; AVX512-NEXT: # %bb.84: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: .LBB2_85: -; AVX512-NEXT: vpextrb $7, %xmm2, %ecx -; AVX512-NEXT: vpextrb $7, %xmm3, %r12d -; AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r12b -; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_87 -; AVX512-NEXT: # %bb.86: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_87: -; AVX512-NEXT: vpextrb $6, %xmm2, %ecx -; AVX512-NEXT: vpextrb $6, %xmm3, %r12d -; 
AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r12b -; AVX512-NEXT: jno .LBB2_89 -; AVX512-NEXT: # %bb.88: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r12d -; AVX512-NEXT: .LBB2_89: -; AVX512-NEXT: vpextrb $5, %xmm2, %ecx -; AVX512-NEXT: vpextrb $5, %xmm3, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: jno .LBB2_91 -; AVX512-NEXT: # %bb.90: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_91: -; AVX512-NEXT: vpextrb $4, %xmm2, %ecx -; AVX512-NEXT: vpextrb $4, %xmm3, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %sil -; AVX512-NEXT: jno .LBB2_93 -; AVX512-NEXT: # %bb.92: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB2_93: -; AVX512-NEXT: vpextrb $3, %xmm2, %ecx -; AVX512-NEXT: vpextrb $3, %xmm3, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_95 -; AVX512-NEXT: # %bb.94: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_95: -; AVX512-NEXT: vpextrb $2, %xmm2, %ecx -; AVX512-NEXT: vpextrb $2, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: jno .LBB2_97 -; AVX512-NEXT: # %bb.96: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_97: -; AVX512-NEXT: vpextrb $0, %xmm2, %ecx -; AVX512-NEXT: vpextrb $0, %xmm3, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bl -; AVX512-NEXT: jno .LBB2_99 -; AVX512-NEXT: # %bb.98: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_99: -; AVX512-NEXT: vpextrb $1, %xmm2, %ecx -; AVX512-NEXT: vpextrb $1, %xmm3, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r11b -; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_101 -; AVX512-NEXT: # %bb.100: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_101: -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vpextrb $15, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r11b -; AVX512-NEXT: jno .LBB2_103 -; AVX512-NEXT: # %bb.102: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB2_103: -; AVX512-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512-NEXT: vpextrb $14, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r9b -; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_105 -; AVX512-NEXT: # %bb.104: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_105: -; AVX512-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512-NEXT: 
vpextrb $13, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r9b -; AVX512-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_107 -; AVX512-NEXT: # %bb.106: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r9d -; AVX512-NEXT: .LBB2_107: -; AVX512-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512-NEXT: vpextrb $12, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_109 -; AVX512-NEXT: # %bb.108: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_109: -; AVX512-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512-NEXT: vpextrb $11, %xmm0, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r13b -; AVX512-NEXT: jno .LBB2_111 -; AVX512-NEXT: # %bb.110: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: .LBB2_111: -; AVX512-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512-NEXT: vpextrb $10, %xmm0, %r15d -; AVX512-NEXT: movl %r15d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r15b -; AVX512-NEXT: jno .LBB2_113 -; AVX512-NEXT: # %bb.112: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: .LBB2_113: -; AVX512-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512-NEXT: vpextrb $9, %xmm0, %r14d -; AVX512-NEXT: movl %r14d, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %r14b -; AVX512-NEXT: jno .LBB2_115 -; AVX512-NEXT: # %bb.114: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: .LBB2_115: -; AVX512-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512-NEXT: vpextrb $8, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %bpl -; AVX512-NEXT: jno .LBB2_117 -; AVX512-NEXT: # %bb.116: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_117: -; AVX512-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512-NEXT: vpextrb $7, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: addb %cl, %dil -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_119 -; AVX512-NEXT: # %bb.118: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_119: -; AVX512-NEXT: vpextrb $6, %xmm1, %edx -; AVX512-NEXT: vpextrb $6, %xmm0, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %dl, %al -; AVX512-NEXT: jno .LBB2_121 -; AVX512-NEXT: # %bb.120: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: .LBB2_121: -; AVX512-NEXT: 
vpextrb $5, %xmm1, %ebx -; AVX512-NEXT: vpextrb $5, %xmm0, %ecx -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: addb %bl, %dl -; AVX512-NEXT: setns %dl -; AVX512-NEXT: addb %bl, %cl -; AVX512-NEXT: jno .LBB2_123 -; AVX512-NEXT: # %bb.122: -; AVX512-NEXT: addb $127, %dl -; AVX512-NEXT: movl %edx, %ecx -; AVX512-NEXT: .LBB2_123: -; AVX512-NEXT: vpextrb $4, %xmm1, %esi -; AVX512-NEXT: vpextrb $4, %xmm0, %edx -; AVX512-NEXT: movl %edx, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %sil, %dl -; AVX512-NEXT: jno .LBB2_125 -; AVX512-NEXT: # %bb.124: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %edx -; AVX512-NEXT: .LBB2_125: -; AVX512-NEXT: vpextrb $3, %xmm1, %esi -; AVX512-NEXT: vpextrb $3, %xmm0, %r8d -; AVX512-NEXT: movl %r8d, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %sil, %r8b -; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_127 -; AVX512-NEXT: # %bb.126: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r8d -; AVX512-NEXT: .LBB2_127: -; AVX512-NEXT: vpextrb $2, %xmm1, %esi -; AVX512-NEXT: vpextrb $2, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %sil, %r9b -; AVX512-NEXT: jno .LBB2_129 -; AVX512-NEXT: # %bb.128: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r9d -; AVX512-NEXT: .LBB2_129: -; AVX512-NEXT: vpextrb $0, %xmm1, %esi -; AVX512-NEXT: vpextrb $0, %xmm0, %r10d -; AVX512-NEXT: movl %r10d, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: movl %r11d, %r12d -; AVX512-NEXT: addb %sil, %r10b -; AVX512-NEXT: jno .LBB2_131 -; AVX512-NEXT: # %bb.130: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r10d -; AVX512-NEXT: .LBB2_131: -; AVX512-NEXT: vpextrb $1, %xmm1, %esi -; AVX512-NEXT: vpextrb $1, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %sil, %r11b -; AVX512-NEXT: jno .LBB2_133 -; AVX512-NEXT: # %bb.132: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r11d -; AVX512-NEXT: .LBB2_133: -; AVX512-NEXT: movzbl %r10b, %esi -; AVX512-NEXT: vmovd %esi, %xmm0 -; AVX512-NEXT: movzbl %r11b, %esi -; AVX512-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r9b, %esi -; AVX512-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r8b, %esi -; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %dl, %edx -; AVX512-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %cl, %ecx -; AVX512-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %al, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %dil, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %bpl, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r14b, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r15b, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r13b, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r12b, %eax -; 
AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded 
Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vmovd %eax, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: addq $76, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vpaddsb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y) ret <64 x i8> %z } define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { -; SSE2-LABEL: v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: movd %xmm0, %r8d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r8d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r8w -; SSE2-NEXT: cmovol %ecx, %r8d -; SSE2-NEXT: pextrw $1, %xmm1, %eax -; SSE2-NEXT: pextrw $1, %xmm0, %r9d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r9d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, 
%ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r9w -; SSE2-NEXT: cmovol %ecx, %r9d -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: pextrw $2, %xmm0, %r10d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r10d, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r10w -; SSE2-NEXT: cmovol %ecx, %r10d -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: pextrw $3, %xmm0, %r11d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r11d, %edi -; SSE2-NEXT: addw %ax, %di -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r11w -; SSE2-NEXT: cmovol %ecx, %r11d -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: pextrw $4, %xmm0, %edi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %di -; SSE2-NEXT: cmovol %ecx, %edi -; SSE2-NEXT: pextrw $5, %xmm1, %ecx -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: setns %dl -; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE2-NEXT: addw %cx, %ax -; SSE2-NEXT: cmovol %edx, %eax -; SSE2-NEXT: pextrw $6, %xmm1, %edx -; SSE2-NEXT: pextrw $6, %xmm0, %ecx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %ecx, %ebx -; SSE2-NEXT: addw %dx, %bx -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE2-NEXT: addw %dx, %cx -; SSE2-NEXT: cmovol %esi, %ecx -; SSE2-NEXT: pextrw $7, %xmm1, %edx -; SSE2-NEXT: pextrw $7, %xmm0, %esi -; SSE2-NEXT: xorl %ebx, %ebx -; SSE2-NEXT: movl %esi, %ebp -; SSE2-NEXT: addw %dx, %bp -; SSE2-NEXT: setns %bl -; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSE2-NEXT: addw %dx, %si -; SSE2-NEXT: cmovol %ebx, %esi -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %r11d, %xmm0 -; SSE2-NEXT: movd %r10d, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: movd %xmm0, %r8d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r8d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r8w -; SSSE3-NEXT: cmovol %ecx, %r8d -; SSSE3-NEXT: pextrw $1, %xmm1, %eax -; SSSE3-NEXT: pextrw $1, %xmm0, %r9d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r9d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r9w -; SSSE3-NEXT: cmovol %ecx, %r9d -; SSSE3-NEXT: pextrw $2, %xmm1, %eax -; SSSE3-NEXT: pextrw $2, %xmm0, %r10d -; 
SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r10d, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r10w -; SSSE3-NEXT: cmovol %ecx, %r10d -; SSSE3-NEXT: pextrw $3, %xmm1, %eax -; SSSE3-NEXT: pextrw $3, %xmm0, %r11d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r11d, %edi -; SSSE3-NEXT: addw %ax, %di -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r11w -; SSSE3-NEXT: cmovol %ecx, %r11d -; SSSE3-NEXT: pextrw $4, %xmm1, %eax -; SSSE3-NEXT: pextrw $4, %xmm0, %edi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %edi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %di -; SSSE3-NEXT: cmovol %ecx, %edi -; SSSE3-NEXT: pextrw $5, %xmm1, %ecx -; SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSSE3-NEXT: addw %cx, %ax -; SSSE3-NEXT: cmovol %edx, %eax -; SSSE3-NEXT: pextrw $6, %xmm1, %edx -; SSSE3-NEXT: pextrw $6, %xmm0, %ecx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %ecx, %ebx -; SSSE3-NEXT: addw %dx, %bx -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSSE3-NEXT: addw %dx, %cx -; SSSE3-NEXT: cmovol %esi, %ecx -; SSSE3-NEXT: pextrw $7, %xmm1, %edx -; SSSE3-NEXT: pextrw $7, %xmm0, %esi -; SSSE3-NEXT: xorl %ebx, %ebx -; SSSE3-NEXT: movl %esi, %ebp -; SSSE3-NEXT: addw %dx, %bp -; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSSE3-NEXT: addw %dx, %si -; SSSE3-NEXT: cmovol %ebx, %esi -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %r11d, %xmm0 -; SSSE3-NEXT: movd %r10d, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrw $7, %xmm1, %eax -; SSE41-NEXT: pextrw $7, %xmm0, %r8d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r8d, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r8w -; SSE41-NEXT: cmovol %ecx, %r8d -; SSE41-NEXT: pextrw $6, %xmm1, %eax -; SSE41-NEXT: pextrw $6, %xmm0, %r9d -; SSE41-NEXT: xorl %edx, %edx -; SSE41-NEXT: movl %r9d, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r9w -; SSE41-NEXT: cmovol %edx, %r9d -; SSE41-NEXT: pextrw $5, %xmm1, %eax -; SSE41-NEXT: pextrw $5, %xmm0, %r10d -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %r10d, %edi -; SSE41-NEXT: 
addw %ax, %di -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r10w -; SSE41-NEXT: cmovol %esi, %r10d -; SSE41-NEXT: pextrw $4, %xmm1, %eax -; SSE41-NEXT: pextrw $4, %xmm0, %r11d -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %r11d, %ecx -; SSE41-NEXT: addw %ax, %cx -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r11w -; SSE41-NEXT: cmovol %edi, %r11d -; SSE41-NEXT: pextrw $3, %xmm1, %eax -; SSE41-NEXT: pextrw $3, %xmm0, %edi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %edi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %di -; SSE41-NEXT: cmovol %ecx, %edi -; SSE41-NEXT: pextrw $2, %xmm1, %ecx -; SSE41-NEXT: pextrw $2, %xmm0, %eax -; SSE41-NEXT: xorl %edx, %edx -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE41-NEXT: addw %cx, %ax -; SSE41-NEXT: cmovol %edx, %eax -; SSE41-NEXT: movd %xmm1, %ecx -; SSE41-NEXT: movd %xmm0, %edx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: addw %cx, %bx -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE41-NEXT: addw %cx, %dx -; SSE41-NEXT: cmovol %esi, %edx -; SSE41-NEXT: pextrw $1, %xmm1, %ecx -; SSE41-NEXT: pextrw $1, %xmm0, %esi -; SSE41-NEXT: xorl %ebx, %ebx -; SSE41-NEXT: movl %esi, %ebp -; SSE41-NEXT: addw %cx, %bp -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSE41-NEXT: addw %cx, %si -; SSE41-NEXT: cmovol %ebx, %esi -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrw $1, %esi, %xmm0 -; SSE41-NEXT: pinsrw $2, %eax, %xmm0 -; SSE41-NEXT: pinsrw $3, %edi, %xmm0 -; SSE41-NEXT: pinsrw $4, %r11d, %xmm0 -; SSE41-NEXT: pinsrw $5, %r10d, %xmm0 -; SSE41-NEXT: pinsrw $6, %r9d, %xmm0 -; SSE41-NEXT: pinsrw $7, %r8d, %xmm0 -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; SSE-LABEL: v8i16: +; SSE: # %bb.0: +; SSE-NEXT: paddsw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v8i16: ; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vpextrw $7, %xmm1, %eax -; AVX-NEXT: vpextrw $7, %xmm0, %r8d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %r8d, %edx -; AVX-NEXT: addw %ax, %dx -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %r8w -; AVX-NEXT: cmovol %ecx, %r8d -; AVX-NEXT: vpextrw $6, %xmm1, %eax -; AVX-NEXT: vpextrw $6, %xmm0, %r9d -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: movl %r9d, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %dl -; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %r9w -; AVX-NEXT: cmovol %edx, %r9d -; AVX-NEXT: vpextrw $5, %xmm1, %eax -; AVX-NEXT: vpextrw $5, %xmm0, %r10d -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %r10d, %edi -; AVX-NEXT: addw %ax, %di -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX-NEXT: addw %ax, %r10w -; AVX-NEXT: cmovol %esi, %r10d -; AVX-NEXT: vpextrw $4, %xmm1, %eax -; AVX-NEXT: vpextrw $4, %xmm0, %r11d -; AVX-NEXT: xorl %edi, %edi -; AVX-NEXT: movl %r11d, %ecx -; AVX-NEXT: addw %ax, %cx -; AVX-NEXT: setns %dil -; AVX-NEXT: addl $32767, %edi # imm = 0x7FFF -; AVX-NEXT: addw %ax, %r11w -; AVX-NEXT: cmovol %edi, %r11d -; AVX-NEXT: vpextrw $3, %xmm1, %eax -; AVX-NEXT: vpextrw $3, %xmm0, %edi -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %edi, %edx -; AVX-NEXT: addw %ax, %dx -; AVX-NEXT: setns %cl -; 
AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %di -; AVX-NEXT: cmovol %ecx, %edi -; AVX-NEXT: vpextrw $2, %xmm1, %ecx -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: movl %eax, %esi -; AVX-NEXT: addw %cx, %si -; AVX-NEXT: setns %dl -; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX-NEXT: addw %cx, %ax -; AVX-NEXT: cmovol %edx, %eax -; AVX-NEXT: vmovd %xmm1, %ecx -; AVX-NEXT: vmovd %xmm0, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %edx, %ebx -; AVX-NEXT: addw %cx, %bx -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX-NEXT: addw %cx, %dx -; AVX-NEXT: cmovol %esi, %edx -; AVX-NEXT: vpextrw $1, %xmm1, %ecx -; AVX-NEXT: vpextrw $1, %xmm0, %esi -; AVX-NEXT: xorl %ebx, %ebx -; AVX-NEXT: movl %esi, %ebp -; AVX-NEXT: addw %cx, %bp -; AVX-NEXT: setns %bl -; AVX-NEXT: addl $32767, %ebx # imm = 0x7FFF -; AVX-NEXT: addw %cx, %si -; AVX-NEXT: cmovol %ebx, %esi -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %rbp +; AVX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %z } define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { -; SSE2-LABEL: v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: movd %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $1, %xmm3, %eax -; SSE2-NEXT: pextrw $1, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $2, %xmm3, %eax -; SSE2-NEXT: pextrw $2, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $3, %xmm3, %eax -; SSE2-NEXT: pextrw $3, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $4, %xmm3, %eax -; SSE2-NEXT: pextrw $4, %xmm1, %r14d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r14d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r14w -; SSE2-NEXT: cmovol %ecx, %r14d -; SSE2-NEXT: pextrw $5, %xmm3, %eax -; SSE2-NEXT: pextrw $5, %xmm1, %r15d -; SSE2-NEXT: xorl 
%ecx, %ecx -; SSE2-NEXT: movl %r15d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r15w -; SSE2-NEXT: cmovol %ecx, %r15d -; SSE2-NEXT: pextrw $6, %xmm3, %eax -; SSE2-NEXT: pextrw $6, %xmm1, %r12d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r12d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r12w -; SSE2-NEXT: cmovol %ecx, %r12d -; SSE2-NEXT: pextrw $7, %xmm3, %eax -; SSE2-NEXT: pextrw $7, %xmm1, %r13d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r13d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r13w -; SSE2-NEXT: cmovol %ecx, %r13d -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: movd %xmm0, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: pextrw $1, %xmm2, %eax -; SSE2-NEXT: pextrw $1, %xmm0, %ebx -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebx, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %bx -; SSE2-NEXT: cmovol %ecx, %ebx -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: pextrw $2, %xmm0, %ebp -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebp, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %bp -; SSE2-NEXT: cmovol %ecx, %ebp -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: pextrw $3, %xmm0, %edi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %di -; SSE2-NEXT: cmovol %ecx, %edi -; SSE2-NEXT: pextrw $4, %xmm2, %ecx -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: addw %cx, %r8w -; SSE2-NEXT: setns %dl -; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE2-NEXT: addw %cx, %ax -; SSE2-NEXT: cmovol %edx, %eax -; SSE2-NEXT: pextrw $5, %xmm2, %r8d -; SSE2-NEXT: pextrw $5, %xmm0, %ecx -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movl %ecx, %r9d -; SSE2-NEXT: addw %r8w, %r9w -; SSE2-NEXT: setns %dl -; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE2-NEXT: addw %r8w, %cx -; SSE2-NEXT: cmovol %edx, %ecx -; SSE2-NEXT: pextrw $6, %xmm2, %r8d -; SSE2-NEXT: pextrw $6, %xmm0, %r9d -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movl %r9d, %r10d -; SSE2-NEXT: addw %r8w, %r10w -; SSE2-NEXT: setns %dl -; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE2-NEXT: addw %r8w, %r9w -; SSE2-NEXT: cmovol %edx, %r9d -; SSE2-NEXT: pextrw $7, %xmm2, %r8d -; SSE2-NEXT: pextrw $7, %xmm0, %edx -; SSE2-NEXT: xorl %r10d, %r10d -; SSE2-NEXT: movl %edx, %r11d -; SSE2-NEXT: addw %r8w, %r11w -; SSE2-NEXT: setns %r10b -; SSE2-NEXT: addl $32767, %r10d # imm = 0x7FFF -; SSE2-NEXT: addw %r8w, %dx -; SSE2-NEXT: cmovol %r10d, %edx -; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: movd %ecx, %xmm9 -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movd %edi, %xmm10 -; SSE2-NEXT: movd %ebp, %xmm7 -; SSE2-NEXT: movd %ebx, %xmm11 -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: movd %r13d, %xmm12 -; SSE2-NEXT: movd %r12d, %xmm6 -; SSE2-NEXT: movd %r15d, %xmm13 -; SSE2-NEXT: movd %r14d, %xmm5 -; SSE2-NEXT: movd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movd %xmm3, %eax -; SSSE3-NEXT: movd %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $1, %xmm3, %eax -; SSSE3-NEXT: pextrw $1, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $2, %xmm3, %eax -; SSSE3-NEXT: pextrw $2, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $3, %xmm3, %eax -; SSSE3-NEXT: pextrw $3, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; 
SSSE3-NEXT: pextrw $4, %xmm3, %eax -; SSSE3-NEXT: pextrw $4, %xmm1, %r14d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r14d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r14w -; SSSE3-NEXT: cmovol %ecx, %r14d -; SSSE3-NEXT: pextrw $5, %xmm3, %eax -; SSSE3-NEXT: pextrw $5, %xmm1, %r15d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r15d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r15w -; SSSE3-NEXT: cmovol %ecx, %r15d -; SSSE3-NEXT: pextrw $6, %xmm3, %eax -; SSSE3-NEXT: pextrw $6, %xmm1, %r12d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r12d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r12w -; SSSE3-NEXT: cmovol %ecx, %r12d -; SSSE3-NEXT: pextrw $7, %xmm3, %eax -; SSSE3-NEXT: pextrw $7, %xmm1, %r13d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r13d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r13w -; SSSE3-NEXT: cmovol %ecx, %r13d -; SSSE3-NEXT: movd %xmm2, %eax -; SSSE3-NEXT: movd %xmm0, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: pextrw $1, %xmm2, %eax -; SSSE3-NEXT: pextrw $1, %xmm0, %ebx -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %bx -; SSSE3-NEXT: cmovol %ecx, %ebx -; SSSE3-NEXT: pextrw $2, %xmm2, %eax -; SSSE3-NEXT: pextrw $2, %xmm0, %ebp -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebp, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %bp -; SSSE3-NEXT: cmovol %ecx, %ebp -; SSSE3-NEXT: pextrw $3, %xmm2, %eax -; SSSE3-NEXT: pextrw $3, %xmm0, %edi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %edi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %di -; SSSE3-NEXT: cmovol %ecx, %edi -; SSSE3-NEXT: pextrw $4, %xmm2, %ecx -; SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: addw %cx, %r8w -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSSE3-NEXT: addw %cx, %ax -; SSSE3-NEXT: cmovol %edx, %eax -; SSSE3-NEXT: pextrw $5, %xmm2, %r8d -; SSSE3-NEXT: pextrw $5, %xmm0, %ecx -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movl %ecx, %r9d -; SSSE3-NEXT: addw %r8w, %r9w -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSSE3-NEXT: addw %r8w, %cx -; SSSE3-NEXT: cmovol %edx, %ecx -; SSSE3-NEXT: pextrw $6, %xmm2, %r8d -; SSSE3-NEXT: pextrw $6, %xmm0, %r9d -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movl %r9d, %r10d -; SSSE3-NEXT: addw %r8w, %r10w -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSSE3-NEXT: addw %r8w, %r9w -; SSSE3-NEXT: cmovol %edx, %r9d -; SSSE3-NEXT: pextrw $7, %xmm2, %r8d -; SSSE3-NEXT: pextrw $7, %xmm0, %edx -; SSSE3-NEXT: xorl %r10d, %r10d -; SSSE3-NEXT: movl %edx, %r11d -; SSSE3-NEXT: addw %r8w, %r11w -; SSSE3-NEXT: setns %r10b -; SSSE3-NEXT: 
addl $32767, %r10d # imm = 0x7FFF -; SSSE3-NEXT: addw %r8w, %dx -; SSSE3-NEXT: cmovol %r10d, %edx -; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: movd %ecx, %xmm9 -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movd %edi, %xmm10 -; SSSE3-NEXT: movd %ebp, %xmm7 -; SSSE3-NEXT: movd %ebx, %xmm11 -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: movd %r13d, %xmm12 -; SSSE3-NEXT: movd %r12d, %xmm6 -; SSSE3-NEXT: movd %r15d, %xmm13 -; SSSE3-NEXT: movd %r14d, %xmm5 -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrw $7, %xmm3, %eax -; SSE41-NEXT: pextrw $7, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $6, %xmm3, %eax -; SSE41-NEXT: pextrw $6, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $5, %xmm3, %eax -; SSE41-NEXT: pextrw $5, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; 
SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $4, %xmm3, %eax -; SSE41-NEXT: pextrw $4, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $3, %xmm3, %eax -; SSE41-NEXT: pextrw $3, %xmm1, %r14d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r14d, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r14w -; SSE41-NEXT: cmovol %ecx, %r14d -; SSE41-NEXT: pextrw $2, %xmm3, %eax -; SSE41-NEXT: pextrw $2, %xmm1, %r15d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r15d, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r15w -; SSE41-NEXT: cmovol %ecx, %r15d -; SSE41-NEXT: movd %xmm3, %eax -; SSE41-NEXT: movd %xmm1, %r12d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r12d, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r12w -; SSE41-NEXT: cmovol %ecx, %r12d -; SSE41-NEXT: pextrw $1, %xmm3, %eax -; SSE41-NEXT: pextrw $1, %xmm1, %r13d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r13d, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r13w -; SSE41-NEXT: cmovol %ecx, %r13d -; SSE41-NEXT: pextrw $7, %xmm2, %eax -; SSE41-NEXT: pextrw $7, %xmm0, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edi -; SSE41-NEXT: addw %ax, %di -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: pextrw $6, %xmm2, %eax -; SSE41-NEXT: pextrw $6, %xmm0, %ebx -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %ebx, %edi -; SSE41-NEXT: addw %ax, %di -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %bx -; SSE41-NEXT: cmovol %ecx, %ebx -; SSE41-NEXT: pextrw $5, %xmm2, %eax -; SSE41-NEXT: pextrw $5, %xmm0, %ebp -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %ebp, %edi -; SSE41-NEXT: addw %ax, %di -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %bp -; SSE41-NEXT: cmovol %ecx, %ebp -; SSE41-NEXT: pextrw $4, %xmm2, %eax -; SSE41-NEXT: pextrw $4, %xmm0, %edi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %edi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %di -; SSE41-NEXT: cmovol %ecx, %edi -; SSE41-NEXT: pextrw $3, %xmm2, %ecx -; SSE41-NEXT: pextrw $3, %xmm0, %eax -; SSE41-NEXT: xorl %edx, %edx -; SSE41-NEXT: movl %eax, %r8d -; SSE41-NEXT: addw %cx, %r8w -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE41-NEXT: addw %cx, %ax -; SSE41-NEXT: cmovol %edx, %eax -; SSE41-NEXT: pextrw $2, %xmm2, %r8d -; SSE41-NEXT: pextrw $2, %xmm0, %ecx -; SSE41-NEXT: xorl %edx, %edx -; SSE41-NEXT: movl %ecx, %r9d -; SSE41-NEXT: addw %r8w, %r9w -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE41-NEXT: addw %r8w, %cx -; SSE41-NEXT: 
cmovol %edx, %ecx -; SSE41-NEXT: movd %xmm2, %r8d -; SSE41-NEXT: movd %xmm0, %r9d -; SSE41-NEXT: xorl %edx, %edx -; SSE41-NEXT: movl %r9d, %r10d -; SSE41-NEXT: addw %r8w, %r10w -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE41-NEXT: addw %r8w, %r9w -; SSE41-NEXT: cmovol %edx, %r9d -; SSE41-NEXT: pextrw $1, %xmm2, %r8d -; SSE41-NEXT: pextrw $1, %xmm0, %edx -; SSE41-NEXT: xorl %r10d, %r10d -; SSE41-NEXT: movl %edx, %r11d -; SSE41-NEXT: addw %r8w, %r11w -; SSE41-NEXT: setns %r10b -; SSE41-NEXT: addl $32767, %r10d # imm = 0x7FFF -; SSE41-NEXT: addw %r8w, %dx -; SSE41-NEXT: cmovol %r10d, %edx -; SSE41-NEXT: movd %r9d, %xmm0 -; SSE41-NEXT: pinsrw $1, %edx, %xmm0 -; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 -; SSE41-NEXT: pinsrw $3, %eax, %xmm0 -; SSE41-NEXT: pinsrw $4, %edi, %xmm0 -; SSE41-NEXT: pinsrw $5, %ebp, %xmm0 -; SSE41-NEXT: pinsrw $6, %ebx, %xmm0 -; SSE41-NEXT: pinsrw $7, %esi, %xmm0 -; SSE41-NEXT: movd %r12d, %xmm1 -; SSE41-NEXT: pinsrw $1, %r13d, %xmm1 -; SSE41-NEXT: pinsrw $2, %r15d, %xmm1 -; SSE41-NEXT: pinsrw $3, %r14d, %xmm1 -; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; SSE-LABEL: v16i16: +; SSE: # %bb.0: +; SSE-NEXT: paddsw %xmm2, %xmm0 +; SSE-NEXT: paddsw %xmm3, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: pushq %r12 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: vpextrw $7, %xmm1, %eax -; AVX1-NEXT: vpextrw $7, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $6, %xmm1, %eax -; AVX1-NEXT: vpextrw $6, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $5, %xmm1, %eax -; AVX1-NEXT: vpextrw $5, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $4, %xmm1, %eax -; AVX1-NEXT: vpextrw $4, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $3, %xmm1, %eax -; AVX1-NEXT: vpextrw $3, %xmm0, %r14d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r14d, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 
0x7FFF -; AVX1-NEXT: addw %ax, %r14w -; AVX1-NEXT: cmovol %ecx, %r14d -; AVX1-NEXT: vpextrw $2, %xmm1, %eax -; AVX1-NEXT: vpextrw $2, %xmm0, %r15d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r15d, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r15w -; AVX1-NEXT: cmovol %ecx, %r15d -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: vmovd %xmm0, %r12d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r12d, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r12w -; AVX1-NEXT: cmovol %ecx, %r12d -; AVX1-NEXT: vpextrw $1, %xmm1, %eax -; AVX1-NEXT: vpextrw $1, %xmm0, %r13d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r13d, %esi -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r13w -; AVX1-NEXT: cmovol %ecx, %r13d -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpextrw $7, %xmm1, %eax -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpextrw $7, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edi -; AVX1-NEXT: addw %ax, %di -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: vpextrw $6, %xmm1, %eax -; AVX1-NEXT: vpextrw $6, %xmm0, %ebx -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %ebx, %edi -; AVX1-NEXT: addw %ax, %di -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %bx -; AVX1-NEXT: cmovol %ecx, %ebx -; AVX1-NEXT: vpextrw $5, %xmm1, %eax -; AVX1-NEXT: vpextrw $5, %xmm0, %ebp -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %ebp, %edi -; AVX1-NEXT: addw %ax, %di -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %bp -; AVX1-NEXT: cmovol %ecx, %ebp -; AVX1-NEXT: vpextrw $4, %xmm1, %eax -; AVX1-NEXT: vpextrw $4, %xmm0, %edi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %edi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %di -; AVX1-NEXT: cmovol %ecx, %edi -; AVX1-NEXT: vpextrw $3, %xmm1, %ecx -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: xorl %edx, %edx -; AVX1-NEXT: movl %eax, %r8d -; AVX1-NEXT: addw %cx, %r8w -; AVX1-NEXT: setns %dl -; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX1-NEXT: addw %cx, %ax -; AVX1-NEXT: cmovol %edx, %eax -; AVX1-NEXT: vpextrw $2, %xmm1, %r8d -; AVX1-NEXT: vpextrw $2, %xmm0, %ecx -; AVX1-NEXT: xorl %edx, %edx -; AVX1-NEXT: movl %ecx, %r9d -; AVX1-NEXT: addw %r8w, %r9w -; AVX1-NEXT: setns %dl -; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX1-NEXT: addw %r8w, %cx -; AVX1-NEXT: cmovol %edx, %ecx -; AVX1-NEXT: vmovd %xmm1, %r8d -; AVX1-NEXT: vmovd %xmm0, %r9d -; AVX1-NEXT: xorl %edx, %edx -; AVX1-NEXT: movl %r9d, %r10d -; AVX1-NEXT: addw %r8w, %r10w -; AVX1-NEXT: setns %dl -; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX1-NEXT: addw %r8w, %r9w -; AVX1-NEXT: cmovol %edx, %r9d -; AVX1-NEXT: vpextrw $1, %xmm1, %r8d -; AVX1-NEXT: vpextrw $1, %xmm0, %edx -; AVX1-NEXT: xorl %r10d, %r10d -; AVX1-NEXT: movl %edx, %r11d -; AVX1-NEXT: addw %r8w, %r11w -; AVX1-NEXT: setns %r10b -; AVX1-NEXT: addl $32767, %r10d # imm = 0x7FFF -; AVX1-NEXT: addw %r8w, %dx -; AVX1-NEXT: cmovol %r10d, %edx -; AVX1-NEXT: vmovd %r9d, %xmm0 -; AVX1-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: 
vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %r12d, %xmm1 -; AVX1-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddsw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vpextrw $7, %xmm1, %eax -; AVX2-NEXT: vpextrw $7, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $6, %xmm1, %eax -; AVX2-NEXT: vpextrw $6, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $5, %xmm1, %eax -; AVX2-NEXT: vpextrw $5, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $4, %xmm1, %eax -; AVX2-NEXT: vpextrw $4, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $3, %xmm1, %eax -; AVX2-NEXT: vpextrw $3, %xmm0, %r14d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r14d, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r14w -; AVX2-NEXT: cmovol %ecx, %r14d -; AVX2-NEXT: vpextrw $2, %xmm1, %eax -; AVX2-NEXT: vpextrw $2, %xmm0, %r15d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r15d, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r15w -; AVX2-NEXT: cmovol %ecx, %r15d -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: vmovd %xmm0, %r12d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r12d, %edx -; AVX2-NEXT: addw %ax, %dx 
-; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r12w -; AVX2-NEXT: cmovol %ecx, %r12d -; AVX2-NEXT: vpextrw $1, %xmm1, %eax -; AVX2-NEXT: vpextrw $1, %xmm0, %r13d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r13d, %esi -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r13w -; AVX2-NEXT: cmovol %ecx, %r13d -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpextrw $7, %xmm1, %eax -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edi -; AVX2-NEXT: addw %ax, %di -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: vpextrw $6, %xmm1, %eax -; AVX2-NEXT: vpextrw $6, %xmm0, %ebx -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %ebx, %edi -; AVX2-NEXT: addw %ax, %di -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %bx -; AVX2-NEXT: cmovol %ecx, %ebx -; AVX2-NEXT: vpextrw $5, %xmm1, %eax -; AVX2-NEXT: vpextrw $5, %xmm0, %ebp -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %ebp, %edi -; AVX2-NEXT: addw %ax, %di -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %bp -; AVX2-NEXT: cmovol %ecx, %ebp -; AVX2-NEXT: vpextrw $4, %xmm1, %eax -; AVX2-NEXT: vpextrw $4, %xmm0, %edi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %edi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %di -; AVX2-NEXT: cmovol %ecx, %edi -; AVX2-NEXT: vpextrw $3, %xmm1, %ecx -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: movl %eax, %r8d -; AVX2-NEXT: addw %cx, %r8w -; AVX2-NEXT: setns %dl -; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX2-NEXT: addw %cx, %ax -; AVX2-NEXT: cmovol %edx, %eax -; AVX2-NEXT: vpextrw $2, %xmm1, %r8d -; AVX2-NEXT: vpextrw $2, %xmm0, %ecx -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: movl %ecx, %r9d -; AVX2-NEXT: addw %r8w, %r9w -; AVX2-NEXT: setns %dl -; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX2-NEXT: addw %r8w, %cx -; AVX2-NEXT: cmovol %edx, %ecx -; AVX2-NEXT: vmovd %xmm1, %r8d -; AVX2-NEXT: vmovd %xmm0, %r9d -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: movl %r9d, %r10d -; AVX2-NEXT: addw %r8w, %r10w -; AVX2-NEXT: setns %dl -; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX2-NEXT: addw %r8w, %r9w -; AVX2-NEXT: cmovol %edx, %r9d -; AVX2-NEXT: vpextrw $1, %xmm1, %r8d -; AVX2-NEXT: vpextrw $1, %xmm0, %edx -; AVX2-NEXT: xorl %r10d, %r10d -; AVX2-NEXT: movl %edx, %r11d -; AVX2-NEXT: addw %r8w, %r11w -; AVX2-NEXT: setns %r10b -; AVX2-NEXT: addl $32767, %r10d # imm = 0x7FFF -; AVX2-NEXT: addw %r8w, %dx -; AVX2-NEXT: cmovol %r10d, %edx -; AVX2-NEXT: vmovd %r9d, %xmm0 -; AVX2-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %r12d, %xmm1 -; AVX2-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1, %xmm1 # 4-byte Folded Reload -; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: vpextrw $7, %xmm0, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: vpextrw $6, %xmm0, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: vpextrw $5, %xmm0, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: vpextrw $4, %xmm0, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: vpextrw $3, %xmm0, %r14d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r14d, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r14w -; AVX512-NEXT: cmovol %ecx, %r14d -; AVX512-NEXT: vpextrw $2, %xmm1, %eax -; AVX512-NEXT: vpextrw $2, %xmm0, %r15d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r15d, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r15w -; AVX512-NEXT: cmovol %ecx, %r15d -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: vmovd %xmm0, %r12d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r12d, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r12w -; AVX512-NEXT: cmovol %ecx, %r12d -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: vpextrw $1, %xmm0, %r13d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r13d, %esi -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r13w -; AVX512-NEXT: cmovol %ecx, %r13d -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: 
vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpextrw $7, %xmm0, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edi -; AVX512-NEXT: addw %ax, %di -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: vpextrw $6, %xmm0, %ebx -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %ebx, %edi -; AVX512-NEXT: addw %ax, %di -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %bx -; AVX512-NEXT: cmovol %ecx, %ebx -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: vpextrw $5, %xmm0, %ebp -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %ebp, %edi -; AVX512-NEXT: addw %ax, %di -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %bp -; AVX512-NEXT: cmovol %ecx, %ebp -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: vpextrw $4, %xmm0, %edi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %edi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %di -; AVX512-NEXT: cmovol %ecx, %edi -; AVX512-NEXT: vpextrw $3, %xmm1, %ecx -; AVX512-NEXT: vpextrw $3, %xmm0, %eax -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: movl %eax, %r8d -; AVX512-NEXT: addw %cx, %r8w -; AVX512-NEXT: setns %dl -; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX512-NEXT: addw %cx, %ax -; AVX512-NEXT: cmovol %edx, %eax -; AVX512-NEXT: vpextrw $2, %xmm1, %r8d -; AVX512-NEXT: vpextrw $2, %xmm0, %ecx -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: movl %ecx, %r9d -; AVX512-NEXT: addw %r8w, %r9w -; AVX512-NEXT: setns %dl -; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX512-NEXT: addw %r8w, %cx -; AVX512-NEXT: cmovol %edx, %ecx -; AVX512-NEXT: vmovd %xmm1, %r8d -; AVX512-NEXT: vmovd %xmm0, %r9d -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: movl %r9d, %r10d -; AVX512-NEXT: addw %r8w, %r10w -; AVX512-NEXT: setns %dl -; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX512-NEXT: addw %r8w, %r9w -; AVX512-NEXT: cmovol %edx, %r9d -; AVX512-NEXT: vpextrw $1, %xmm1, %r8d -; AVX512-NEXT: vpextrw $1, %xmm0, %edx -; AVX512-NEXT: xorl %r10d, %r10d -; AVX512-NEXT: movl %edx, %r11d -; AVX512-NEXT: addw %r8w, %r11w -; AVX512-NEXT: setns %r10b -; AVX512-NEXT: addl $32767, %r10d # imm = 0x7FFF -; AVX512-NEXT: addw %r8w, %dx -; AVX512-NEXT: cmovol %r10d, %edx -; AVX512-NEXT: vmovd %r9d, %xmm0 -; AVX512-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %r12d, %xmm1 -; AVX512-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1 -; AVX512-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1 -; AVX512-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1 -; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: 
popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %z = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y) ret <16 x i16> %z } define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { -; SSE2-LABEL: v32i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pushq %rax -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: movd %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $1, %xmm5, %eax -; SSE2-NEXT: pextrw $1, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $2, %xmm5, %eax -; SSE2-NEXT: pextrw $2, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $3, %xmm5, %eax -; SSE2-NEXT: pextrw $3, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $4, %xmm5, %eax -; SSE2-NEXT: pextrw $4, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $5, %xmm5, %eax -; SSE2-NEXT: pextrw $5, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $6, %xmm5, %eax -; SSE2-NEXT: pextrw $6, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $7, %xmm5, %eax -; SSE2-NEXT: pextrw $7, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movd %xmm6, %eax -; SSE2-NEXT: movd %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, 
%ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $1, %xmm6, %eax -; SSE2-NEXT: pextrw $1, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $2, %xmm6, %eax -; SSE2-NEXT: pextrw $2, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $3, %xmm6, %eax -; SSE2-NEXT: pextrw $3, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $4, %xmm6, %eax -; SSE2-NEXT: pextrw $4, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $5, %xmm6, %eax -; SSE2-NEXT: pextrw $5, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $6, %xmm6, %eax -; SSE2-NEXT: pextrw $6, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $7, %xmm6, %eax -; SSE2-NEXT: pextrw $7, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movd %xmm7, %eax -; SSE2-NEXT: movd %xmm3, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $1, %xmm7, %eax -; SSE2-NEXT: pextrw $1, %xmm3, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $2, %xmm7, %eax -; SSE2-NEXT: pextrw $2, %xmm3, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: 
addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $3, %xmm7, %eax -; SSE2-NEXT: pextrw $3, %xmm3, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $4, %xmm7, %eax -; SSE2-NEXT: pextrw $4, %xmm3, %ebp -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebp, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %bp -; SSE2-NEXT: cmovol %ecx, %ebp -; SSE2-NEXT: pextrw $5, %xmm7, %eax -; SSE2-NEXT: pextrw $5, %xmm3, %ebx -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebx, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %bx -; SSE2-NEXT: cmovol %ecx, %ebx -; SSE2-NEXT: pextrw $6, %xmm7, %eax -; SSE2-NEXT: pextrw $6, %xmm3, %r11d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r11d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r11w -; SSE2-NEXT: cmovol %ecx, %r11d -; SSE2-NEXT: pextrw $7, %xmm7, %eax -; SSE2-NEXT: pextrw $7, %xmm3, %r10d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r10d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r10w -; SSE2-NEXT: cmovol %ecx, %r10d -; SSE2-NEXT: movd %xmm4, %eax -; SSE2-NEXT: movd %xmm0, %r9d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r9d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r9w -; SSE2-NEXT: cmovol %ecx, %r9d -; SSE2-NEXT: pextrw $1, %xmm4, %eax -; SSE2-NEXT: pextrw $1, %xmm0, %r8d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r8d, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r8w -; SSE2-NEXT: cmovol %ecx, %r8d -; SSE2-NEXT: pextrw $2, %xmm4, %eax -; SSE2-NEXT: pextrw $2, %xmm0, %edi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %di -; SSE2-NEXT: cmovol %ecx, %edi -; SSE2-NEXT: pextrw $3, %xmm4, %eax -; SSE2-NEXT: pextrw $3, %xmm0, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: pextrw $4, %xmm4, %eax -; SSE2-NEXT: pextrw $4, %xmm0, %edx -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edx, %r13d -; SSE2-NEXT: addw %ax, %r13w -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: cmovol %ecx, %edx -; SSE2-NEXT: pextrw $5, %xmm4, %r13d -; SSE2-NEXT: pextrw $5, %xmm0, %ecx -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movl %ecx, %r12d -; SSE2-NEXT: addw %r13w, %r12w -; SSE2-NEXT: setns %al -; SSE2-NEXT: addl $32767, %eax # imm = 0x7FFF -; SSE2-NEXT: addw %r13w, %cx -; SSE2-NEXT: cmovol %eax, %ecx -; SSE2-NEXT: pextrw $6, %xmm4, %r12d -; SSE2-NEXT: pextrw $6, %xmm0, %r13d -; SSE2-NEXT: xorl 
%eax, %eax -; SSE2-NEXT: movl %r13d, %r15d -; SSE2-NEXT: addw %r12w, %r15w -; SSE2-NEXT: setns %al -; SSE2-NEXT: addl $32767, %eax # imm = 0x7FFF -; SSE2-NEXT: addw %r12w, %r13w -; SSE2-NEXT: cmovol %eax, %r13d -; SSE2-NEXT: pextrw $7, %xmm4, %r15d -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: xorl %r12d, %r12d -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: addw %r15w, %r14w -; SSE2-NEXT: setns %r12b -; SSE2-NEXT: addl $32767, %r12d # imm = 0x7FFF -; SSE2-NEXT: addw %r15w, %ax -; SSE2-NEXT: cmovol %r12d, %eax -; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: movd %r13d, %xmm12 -; SSE2-NEXT: movd %ecx, %xmm8 -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: movd %r8d, %xmm13 -; SSE2-NEXT: movd %r9d, %xmm5 -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] -; 
SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE2-NEXT: movd %r10d, %xmm11 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE2-NEXT: movd %r11d, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; SSE2-NEXT: movd %ebx, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE2-NEXT: movd %ebp, %xmm15 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm15[0] -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: addq $8, %rsp -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v32i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: pushq %rax -; SSSE3-NEXT: movd %xmm5, %eax -; SSSE3-NEXT: movd %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $1, %xmm5, %eax -; SSSE3-NEXT: pextrw $1, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; 
SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $2, %xmm5, %eax -; SSSE3-NEXT: pextrw $2, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $3, %xmm5, %eax -; SSSE3-NEXT: pextrw $3, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $4, %xmm5, %eax -; SSSE3-NEXT: pextrw $4, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $5, %xmm5, %eax -; SSSE3-NEXT: pextrw $5, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $6, %xmm5, %eax -; SSSE3-NEXT: pextrw $6, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $7, %xmm5, %eax -; SSSE3-NEXT: pextrw $7, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movd %xmm6, %eax -; SSSE3-NEXT: movd %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $1, %xmm6, %eax -; SSSE3-NEXT: pextrw $1, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $2, %xmm6, %eax -; SSSE3-NEXT: pextrw $2, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte 
Spill -; SSSE3-NEXT: pextrw $3, %xmm6, %eax -; SSSE3-NEXT: pextrw $3, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $4, %xmm6, %eax -; SSSE3-NEXT: pextrw $4, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $5, %xmm6, %eax -; SSSE3-NEXT: pextrw $5, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $6, %xmm6, %eax -; SSSE3-NEXT: pextrw $6, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $7, %xmm6, %eax -; SSSE3-NEXT: pextrw $7, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movd %xmm7, %eax -; SSSE3-NEXT: movd %xmm3, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $1, %xmm7, %eax -; SSSE3-NEXT: pextrw $1, %xmm3, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $2, %xmm7, %eax -; SSSE3-NEXT: pextrw $2, %xmm3, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $3, %xmm7, %eax -; SSSE3-NEXT: pextrw $3, %xmm3, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $4, %xmm7, %eax -; SSSE3-NEXT: pextrw $4, %xmm3, %ebp -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebp, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; 
SSSE3-NEXT: addw %ax, %bp -; SSSE3-NEXT: cmovol %ecx, %ebp -; SSSE3-NEXT: pextrw $5, %xmm7, %eax -; SSSE3-NEXT: pextrw $5, %xmm3, %ebx -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %bx -; SSSE3-NEXT: cmovol %ecx, %ebx -; SSSE3-NEXT: pextrw $6, %xmm7, %eax -; SSSE3-NEXT: pextrw $6, %xmm3, %r11d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r11d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r11w -; SSSE3-NEXT: cmovol %ecx, %r11d -; SSSE3-NEXT: pextrw $7, %xmm7, %eax -; SSSE3-NEXT: pextrw $7, %xmm3, %r10d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r10d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r10w -; SSSE3-NEXT: cmovol %ecx, %r10d -; SSSE3-NEXT: movd %xmm4, %eax -; SSSE3-NEXT: movd %xmm0, %r9d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r9d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r9w -; SSSE3-NEXT: cmovol %ecx, %r9d -; SSSE3-NEXT: pextrw $1, %xmm4, %eax -; SSSE3-NEXT: pextrw $1, %xmm0, %r8d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r8d, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r8w -; SSSE3-NEXT: cmovol %ecx, %r8d -; SSSE3-NEXT: pextrw $2, %xmm4, %eax -; SSSE3-NEXT: pextrw $2, %xmm0, %edi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %edi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %di -; SSSE3-NEXT: cmovol %ecx, %edi -; SSSE3-NEXT: pextrw $3, %xmm4, %eax -; SSSE3-NEXT: pextrw $3, %xmm0, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: pextrw $4, %xmm4, %eax -; SSSE3-NEXT: pextrw $4, %xmm0, %edx -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %edx, %r13d -; SSSE3-NEXT: addw %ax, %r13w -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: cmovol %ecx, %edx -; SSSE3-NEXT: pextrw $5, %xmm4, %r13d -; SSSE3-NEXT: pextrw $5, %xmm0, %ecx -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: movl %ecx, %r12d -; SSSE3-NEXT: addw %r13w, %r12w -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addl $32767, %eax # imm = 0x7FFF -; SSSE3-NEXT: addw %r13w, %cx -; SSSE3-NEXT: cmovol %eax, %ecx -; SSSE3-NEXT: pextrw $6, %xmm4, %r12d -; SSSE3-NEXT: pextrw $6, %xmm0, %r13d -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: movl %r13d, %r15d -; SSSE3-NEXT: addw %r12w, %r15w -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addl $32767, %eax # imm = 0x7FFF -; SSSE3-NEXT: addw %r12w, %r13w -; SSSE3-NEXT: cmovol %eax, %r13d -; SSSE3-NEXT: pextrw $7, %xmm4, %r15d -; SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSSE3-NEXT: xorl %r12d, %r12d -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: addw %r15w, %r14w -; SSSE3-NEXT: setns %r12b -; SSSE3-NEXT: addl $32767, %r12d # imm = 0x7FFF -; SSSE3-NEXT: addw %r15w, %ax -; SSSE3-NEXT: cmovol %r12d, %eax -; SSSE3-NEXT: movd %eax, %xmm10 -; SSSE3-NEXT: movd %r13d, %xmm12 -; SSSE3-NEXT: movd %ecx, %xmm8 -; SSSE3-NEXT: movd %edx, %xmm2 -; 
SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: movd %r8d, %xmm13 -; SSSE3-NEXT: movd %r9d, %xmm5 -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSSE3-NEXT: movd %r10d, %xmm11 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSSE3-NEXT: movd %r11d, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; SSSE3-NEXT: movd %ebx, %xmm14 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSSE3-NEXT: movd %ebp, %xmm15 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm15[0] -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: addq $8, %rsp -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v32i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrw $7, %xmm5, %eax -; SSE41-NEXT: pextrw $7, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $6, %xmm5, %eax -; SSE41-NEXT: pextrw $6, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $5, %xmm5, %eax -; SSE41-NEXT: pextrw $5, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, 
%si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $4, %xmm5, %eax -; SSE41-NEXT: pextrw $4, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $3, %xmm5, %eax -; SSE41-NEXT: pextrw $3, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $2, %xmm5, %eax -; SSE41-NEXT: pextrw $2, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movd %xmm5, %eax -; SSE41-NEXT: movd %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $1, %xmm5, %eax -; SSE41-NEXT: pextrw $1, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $7, %xmm6, %eax -; SSE41-NEXT: pextrw $7, %xmm2, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $6, %xmm6, %eax -; SSE41-NEXT: pextrw $6, %xmm2, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $5, %xmm6, %eax -; SSE41-NEXT: pextrw $5, %xmm2, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $4, %xmm6, %eax -; SSE41-NEXT: pextrw $4, %xmm2, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $3, %xmm6, %eax -; SSE41-NEXT: pextrw $3, %xmm2, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; 
SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $2, %xmm6, %eax -; SSE41-NEXT: pextrw $2, %xmm2, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movd %xmm6, %eax -; SSE41-NEXT: movd %xmm2, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $1, %xmm6, %eax -; SSE41-NEXT: pextrw $1, %xmm2, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $7, %xmm7, %eax -; SSE41-NEXT: pextrw $7, %xmm3, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $6, %xmm7, %eax -; SSE41-NEXT: pextrw $6, %xmm3, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $5, %xmm7, %eax -; SSE41-NEXT: pextrw $5, %xmm3, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $4, %xmm7, %eax -; SSE41-NEXT: pextrw $4, %xmm3, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $3, %xmm7, %eax -; SSE41-NEXT: pextrw $3, %xmm3, %ebx -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %bx -; SSE41-NEXT: cmovol %ecx, %ebx -; SSE41-NEXT: pextrw $2, %xmm7, %eax -; SSE41-NEXT: pextrw $2, %xmm3, %r11d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r11d, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r11w -; SSE41-NEXT: cmovol %ecx, %r11d -; SSE41-NEXT: movd %xmm7, %eax -; SSE41-NEXT: movd %xmm3, %r10d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r10d, %edx -; SSE41-NEXT: addw %ax, 
%dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r10w -; SSE41-NEXT: cmovol %ecx, %r10d -; SSE41-NEXT: pextrw $1, %xmm7, %eax -; SSE41-NEXT: pextrw $1, %xmm3, %r9d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r9d, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r9w -; SSE41-NEXT: cmovol %ecx, %r9d -; SSE41-NEXT: pextrw $7, %xmm4, %eax -; SSE41-NEXT: pextrw $7, %xmm0, %r8d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r8d, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r8w -; SSE41-NEXT: cmovol %ecx, %r8d -; SSE41-NEXT: pextrw $6, %xmm4, %eax -; SSE41-NEXT: pextrw $6, %xmm0, %edi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %edi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %di -; SSE41-NEXT: cmovol %ecx, %edi -; SSE41-NEXT: pextrw $5, %xmm4, %eax -; SSE41-NEXT: pextrw $5, %xmm0, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: pextrw $4, %xmm4, %eax -; SSE41-NEXT: pextrw $4, %xmm0, %edx -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %edx, %r13d -; SSE41-NEXT: addw %ax, %r13w -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: cmovol %ecx, %edx -; SSE41-NEXT: pextrw $3, %xmm4, %eax -; SSE41-NEXT: pextrw $3, %xmm0, %r13d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r13d, %r12d -; SSE41-NEXT: addw %ax, %r12w -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r13w -; SSE41-NEXT: cmovol %ecx, %r13d -; SSE41-NEXT: pextrw $2, %xmm4, %r12d -; SSE41-NEXT: pextrw $2, %xmm0, %eax -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: addw %r12w, %r15w -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %r12w, %ax -; SSE41-NEXT: cmovol %ecx, %eax -; SSE41-NEXT: movd %xmm4, %r15d -; SSE41-NEXT: movd %xmm0, %r12d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r12d, %r14d -; SSE41-NEXT: addw %r15w, %r14w -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %r15w, %r12w -; SSE41-NEXT: cmovol %ecx, %r12d -; SSE41-NEXT: pextrw $1, %xmm4, %r14d -; SSE41-NEXT: pextrw $1, %xmm0, %ecx -; SSE41-NEXT: xorl %r15d, %r15d -; SSE41-NEXT: movl %ecx, %ebp -; SSE41-NEXT: addw %r14w, %bp -; SSE41-NEXT: setns %r15b -; SSE41-NEXT: addl $32767, %r15d # imm = 0x7FFF -; SSE41-NEXT: addw %r14w, %cx -; SSE41-NEXT: cmovol %r15d, %ecx -; SSE41-NEXT: movd %r12d, %xmm0 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 -; SSE41-NEXT: pinsrw $2, %eax, %xmm0 -; SSE41-NEXT: pinsrw $3, %r13d, %xmm0 -; SSE41-NEXT: pinsrw $4, %edx, %xmm0 -; SSE41-NEXT: pinsrw $5, %esi, %xmm0 -; SSE41-NEXT: pinsrw $6, %edi, %xmm0 -; SSE41-NEXT: pinsrw $7, %r8d, %xmm0 -; SSE41-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE41-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE41-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE41-NEXT: pinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE41-NEXT: pinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded 
Reload
-; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: movd %r10d, %xmm3
-; SSE41-NEXT: pinsrw $1, %r9d, %xmm3
-; SSE41-NEXT: pinsrw $2, %r11d, %xmm3
-; SSE41-NEXT: pinsrw $3, %ebx, %xmm3
-; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v32i16:
+; SSE: # %bb.0:
+; SSE-NEXT: paddsw %xmm4, %xmm0
+; SSE-NEXT: paddsw %xmm5, %xmm1
+; SSE-NEXT: paddsw %xmm6, %xmm2
+; SSE-NEXT: paddsw %xmm7, %xmm3
+; SSE-NEXT: retq
;
; AVX1-LABEL: v32i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vpextrw $7, %xmm3, %eax
-; AVX1-NEXT: vpextrw $7, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: addw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: addw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $6, %xmm3, %eax
-; AVX1-NEXT: vpextrw $6, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: addw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: addw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $5, %xmm3, %eax
-; AVX1-NEXT: vpextrw $5, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: addw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: addw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $4, %xmm3, %eax
-; AVX1-NEXT: vpextrw $4, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: addw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: addw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $3, %xmm3, %eax
-; AVX1-NEXT: vpextrw $3, %xmm1,
%esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $2, %xmm3, %eax -; AVX1-NEXT: vpextrw $2, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vmovd %xmm3, %eax -; AVX1-NEXT: vmovd %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $1, %xmm3, %eax -; AVX1-NEXT: vpextrw $1, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpextrw $7, %xmm3, %eax -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpextrw $7, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $6, %xmm3, %eax -; AVX1-NEXT: vpextrw $6, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $5, %xmm3, %eax -; AVX1-NEXT: vpextrw $5, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $4, %xmm3, %eax -; AVX1-NEXT: vpextrw $4, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $3, %xmm3, %eax -; AVX1-NEXT: vpextrw $3, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $2, %xmm3, %eax -; AVX1-NEXT: vpextrw $2, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: 
movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vmovd %xmm3, %eax -; AVX1-NEXT: vmovd %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $1, %xmm3, %eax -; AVX1-NEXT: vpextrw $1, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $7, %xmm2, %eax -; AVX1-NEXT: vpextrw $7, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $6, %xmm2, %eax -; AVX1-NEXT: vpextrw $6, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $5, %xmm2, %eax -; AVX1-NEXT: vpextrw $5, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $4, %xmm2, %eax -; AVX1-NEXT: vpextrw $4, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: vpextrw $3, %xmm2, %eax -; AVX1-NEXT: vpextrw $3, %xmm0, %ebx -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %ebx, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %bx -; AVX1-NEXT: cmovol %ecx, %ebx -; AVX1-NEXT: vpextrw $2, %xmm2, %eax -; AVX1-NEXT: vpextrw $2, %xmm0, %r11d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r11d, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r11w -; AVX1-NEXT: cmovol %ecx, %r11d -; AVX1-NEXT: vmovd %xmm2, %eax -; AVX1-NEXT: vmovd %xmm0, %r10d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r10d, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r10w -; AVX1-NEXT: cmovol %ecx, %r10d -; AVX1-NEXT: vpextrw $1, %xmm2, %eax -; AVX1-NEXT: vpextrw $1, %xmm0, %r9d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r9d, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r9w -; AVX1-NEXT: cmovol %ecx, %r9d -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpextrw $7, %xmm1, %eax -; AVX1-NEXT: vpextrw $7, %xmm0, %r8d -; 
AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r8d, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r8w -; AVX1-NEXT: cmovol %ecx, %r8d -; AVX1-NEXT: vpextrw $6, %xmm1, %eax -; AVX1-NEXT: vpextrw $6, %xmm0, %edi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %edi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %di -; AVX1-NEXT: cmovol %ecx, %edi -; AVX1-NEXT: vpextrw $5, %xmm1, %eax -; AVX1-NEXT: vpextrw $5, %xmm0, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edx -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: vpextrw $4, %xmm1, %eax -; AVX1-NEXT: vpextrw $4, %xmm0, %edx -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %edx, %r13d -; AVX1-NEXT: addw %ax, %r13w -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %dx -; AVX1-NEXT: cmovol %ecx, %edx -; AVX1-NEXT: vpextrw $3, %xmm1, %eax -; AVX1-NEXT: vpextrw $3, %xmm0, %r13d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r13d, %r12d -; AVX1-NEXT: addw %ax, %r12w -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r13w -; AVX1-NEXT: cmovol %ecx, %r13d -; AVX1-NEXT: vpextrw $2, %xmm1, %r12d -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: addw %r12w, %r15w -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %r12w, %ax -; AVX1-NEXT: cmovol %ecx, %eax -; AVX1-NEXT: vmovd %xmm1, %r15d -; AVX1-NEXT: vmovd %xmm0, %r12d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r12d, %r14d -; AVX1-NEXT: addw %r15w, %r14w -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %r15w, %r12w -; AVX1-NEXT: cmovol %ecx, %r12d -; AVX1-NEXT: vpextrw $1, %xmm1, %r14d -; AVX1-NEXT: vpextrw $1, %xmm0, %ecx -; AVX1-NEXT: xorl %r15d, %r15d -; AVX1-NEXT: movl %ecx, %ebp -; AVX1-NEXT: addw %r14w, %bp -; AVX1-NEXT: setns %r15b -; AVX1-NEXT: addl $32767, %r15d # imm = 0x7FFF -; AVX1-NEXT: addw %r14w, %cx -; AVX1-NEXT: cmovol %r15d, %ecx -; AVX1-NEXT: vmovd %r12d, %xmm0 -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %r10d, %xmm1 -; AVX1-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX1-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX1-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX1-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte 
Folded Reload
-; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: # xmm3 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddsw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpaddsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddsw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddsw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v32i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vpextrw $7, %xmm3, %eax
-; AVX2-NEXT: vpextrw $7, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: addw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: addw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $6, %xmm3, %eax
-; AVX2-NEXT: vpextrw $6, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: addw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: addw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $5, %xmm3, %eax
-; AVX2-NEXT: vpextrw $5, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: addw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: addw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $4, %xmm3, %eax
-; AVX2-NEXT: vpextrw $4, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: addw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: addw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $3, %xmm3, %eax
-; AVX2-NEXT: vpextrw $3, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-;
AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $2, %xmm3, %eax -; AVX2-NEXT: vpextrw $2, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vmovd %xmm3, %eax -; AVX2-NEXT: vmovd %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $1, %xmm3, %eax -; AVX2-NEXT: vpextrw $1, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vpextrw $7, %xmm3, %eax -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpextrw $7, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $6, %xmm3, %eax -; AVX2-NEXT: vpextrw $6, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $5, %xmm3, %eax -; AVX2-NEXT: vpextrw $5, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $4, %xmm3, %eax -; AVX2-NEXT: vpextrw $4, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $3, %xmm3, %eax -; AVX2-NEXT: vpextrw $3, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $2, %xmm3, %eax -; AVX2-NEXT: vpextrw $2, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill -; AVX2-NEXT: vmovd %xmm3, %eax -; AVX2-NEXT: vmovd %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $1, %xmm3, %eax -; AVX2-NEXT: vpextrw $1, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $7, %xmm2, %eax -; AVX2-NEXT: vpextrw $7, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $6, %xmm2, %eax -; AVX2-NEXT: vpextrw $6, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $5, %xmm2, %eax -; AVX2-NEXT: vpextrw $5, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $4, %xmm2, %eax -; AVX2-NEXT: vpextrw $4, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrw $3, %xmm2, %eax -; AVX2-NEXT: vpextrw $3, %xmm0, %ebx -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %ebx, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %bx -; AVX2-NEXT: cmovol %ecx, %ebx -; AVX2-NEXT: vpextrw $2, %xmm2, %eax -; AVX2-NEXT: vpextrw $2, %xmm0, %r11d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r11d, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r11w -; AVX2-NEXT: cmovol %ecx, %r11d -; AVX2-NEXT: vmovd %xmm2, %eax -; AVX2-NEXT: vmovd %xmm0, %r10d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r10d, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r10w -; AVX2-NEXT: cmovol %ecx, %r10d -; AVX2-NEXT: vpextrw $1, %xmm2, %eax -; AVX2-NEXT: vpextrw $1, %xmm0, %r9d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r9d, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r9w -; AVX2-NEXT: cmovol %ecx, %r9d -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm1, %eax -; AVX2-NEXT: vpextrw $7, %xmm0, %r8d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl 
%r8d, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r8w -; AVX2-NEXT: cmovol %ecx, %r8d -; AVX2-NEXT: vpextrw $6, %xmm1, %eax -; AVX2-NEXT: vpextrw $6, %xmm0, %edi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %edi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %di -; AVX2-NEXT: cmovol %ecx, %edi -; AVX2-NEXT: vpextrw $5, %xmm1, %eax -; AVX2-NEXT: vpextrw $5, %xmm0, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edx -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: vpextrw $4, %xmm1, %eax -; AVX2-NEXT: vpextrw $4, %xmm0, %edx -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %edx, %r13d -; AVX2-NEXT: addw %ax, %r13w -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %dx -; AVX2-NEXT: cmovol %ecx, %edx -; AVX2-NEXT: vpextrw $3, %xmm1, %eax -; AVX2-NEXT: vpextrw $3, %xmm0, %r13d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r13d, %r12d -; AVX2-NEXT: addw %ax, %r12w -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r13w -; AVX2-NEXT: cmovol %ecx, %r13d -; AVX2-NEXT: vpextrw $2, %xmm1, %r12d -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: addw %r12w, %r15w -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %r12w, %ax -; AVX2-NEXT: cmovol %ecx, %eax -; AVX2-NEXT: vmovd %xmm1, %r15d -; AVX2-NEXT: vmovd %xmm0, %r12d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r12d, %r14d -; AVX2-NEXT: addw %r15w, %r14w -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %r15w, %r12w -; AVX2-NEXT: cmovol %ecx, %r12d -; AVX2-NEXT: vpextrw $1, %xmm1, %r14d -; AVX2-NEXT: vpextrw $1, %xmm0, %ecx -; AVX2-NEXT: xorl %r15d, %r15d -; AVX2-NEXT: movl %ecx, %ebp -; AVX2-NEXT: addw %r14w, %bp -; AVX2-NEXT: setns %r15b -; AVX2-NEXT: addl $32767, %r15d # imm = 0x7FFF -; AVX2-NEXT: addw %r14w, %cx -; AVX2-NEXT: cmovol %r15d, %ecx -; AVX2-NEXT: vmovd %r12d, %xmm0 -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %r10d, %xmm1 -; AVX2-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX2-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; AVX2-NEXT: # xmm2 = mem[0],zero,zero,zero -; AVX2-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX2-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX2-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX2-NEXT: vpinsrw $4, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: # xmm3 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpaddsw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v32i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: vpextrw $7, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: addw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: addw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: vpextrw $6, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: addw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: addw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: vpextrw $5, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: addw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: addw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: vpextrw $4, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: addw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: addw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: vpextrw $3, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: addw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: addw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: vpextrw $2, %xmm0, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: vmovd %xmm0, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: vpextrw $1, %xmm0, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpextrw $7, %xmm2, %eax -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpextrw $7, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $6, %xmm2, %eax -; AVX512-NEXT: vpextrw $6, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $5, %xmm2, %eax -; AVX512-NEXT: vpextrw $5, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $4, %xmm2, %eax -; AVX512-NEXT: vpextrw $4, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $3, %xmm2, %eax -; AVX512-NEXT: vpextrw $3, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $2, %xmm2, %eax -; AVX512-NEXT: vpextrw $2, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vmovd %xmm2, %eax -; AVX512-NEXT: vmovd %xmm3, %esi -; AVX512-NEXT: xorl %ecx, 
%ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $1, %xmm2, %eax -; AVX512-NEXT: vpextrw $1, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512-NEXT: vpextrw $7, %xmm2, %eax -; AVX512-NEXT: vpextrw $7, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $6, %xmm2, %eax -; AVX512-NEXT: vpextrw $6, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $5, %xmm2, %eax -; AVX512-NEXT: vpextrw $5, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $4, %xmm2, %eax -; AVX512-NEXT: vpextrw $4, %xmm3, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrw $3, %xmm2, %eax -; AVX512-NEXT: vpextrw $3, %xmm3, %ebx -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %ebx, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %bx -; AVX512-NEXT: cmovol %ecx, %ebx -; AVX512-NEXT: vpextrw $2, %xmm2, %eax -; AVX512-NEXT: vpextrw $2, %xmm3, %r11d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r11d, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r11w -; AVX512-NEXT: cmovol %ecx, %r11d -; AVX512-NEXT: vmovd %xmm2, %eax -; AVX512-NEXT: vmovd %xmm3, %r10d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r10d, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r10w -; AVX512-NEXT: cmovol %ecx, %r10d -; AVX512-NEXT: vpextrw $1, %xmm2, %eax -; AVX512-NEXT: vpextrw $1, %xmm3, %r9d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r9d, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r9w -; AVX512-NEXT: cmovol %ecx, %r9d -; AVX512-NEXT: vextracti32x4 $3, 
%zmm1, %xmm1 -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: vpextrw $7, %xmm0, %r8d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r8d, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r8w -; AVX512-NEXT: cmovol %ecx, %r8d -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: vpextrw $6, %xmm0, %edi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %edi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %di -; AVX512-NEXT: cmovol %ecx, %edi -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: vpextrw $5, %xmm0, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edx -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: vpextrw $4, %xmm0, %edx -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %edx, %r13d -; AVX512-NEXT: addw %ax, %r13w -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %dx -; AVX512-NEXT: cmovol %ecx, %edx -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: vpextrw $3, %xmm0, %r13d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r13d, %r12d -; AVX512-NEXT: addw %ax, %r12w -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r13w -; AVX512-NEXT: cmovol %ecx, %r13d -; AVX512-NEXT: vpextrw $2, %xmm1, %r12d -; AVX512-NEXT: vpextrw $2, %xmm0, %eax -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: addw %r12w, %r15w -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %r12w, %ax -; AVX512-NEXT: cmovol %ecx, %eax -; AVX512-NEXT: vmovd %xmm1, %r15d -; AVX512-NEXT: vmovd %xmm0, %r12d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r12d, %r14d -; AVX512-NEXT: addw %r15w, %r14w -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %r15w, %r12w -; AVX512-NEXT: cmovol %ecx, %r12d -; AVX512-NEXT: vpextrw $1, %xmm1, %r14d -; AVX512-NEXT: vpextrw $1, %xmm0, %ecx -; AVX512-NEXT: xorl %r15d, %r15d -; AVX512-NEXT: movl %ecx, %ebp -; AVX512-NEXT: addw %r14w, %bp -; AVX512-NEXT: setns %r15b -; AVX512-NEXT: addl $32767, %r15d # imm = 0x7FFF -; AVX512-NEXT: addw %r14w, %cx -; AVX512-NEXT: cmovol %r15d, %ecx -; AVX512-NEXT: vmovd %r12d, %xmm0 -; AVX512-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %r10d, %xmm1 -; AVX512-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1 -; AVX512-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 -; AVX512-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1 -; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX512-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
4-byte Folded Reload
-; AVX512-NEXT: # xmm2 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: # xmm3 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpaddsw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
  ret <32 x i16> %z
@@ -13489,940 +196,76 @@
; Too narrow vectors, legalized by widening.
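The v8i8 test below is the interesting legalization case: <8 x i8> is not a legal x86 vector type, so type legalization widens the operation to <16 x i8>, after which it selects to a single paddsb rather than the removed per-lane pextrw/addw/setns/cmovol sequence. A minimal LLVM IR sketch of the pattern under test follows; the test's own body is not shown in this excerpt, so the function name @v8i8_widen_sketch and the exact load/store shape are illustrative assumptions, not text from the patch.

declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>)

define void @v8i8_widen_sketch(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
  %x = load <8 x i8>, <8 x i8>* %px
  %y = load <8 x i8>, <8 x i8>* %py
  ; Each i8 lane is clamped to the range [-128, 127] instead of wrapping.
  %z = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y)
  store <8 x i8> %z, <8 x i8>* %pz
  ret void
}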
define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { -; SSE2-LABEL: v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: movd %xmm0, %r8d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r8d, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r8w -; SSE2-NEXT: cmovol %ecx, %r8d -; SSE2-NEXT: pextrw $1, %xmm1, %eax -; SSE2-NEXT: pextrw $1, %xmm0, %r9d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r9d, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r9w -; SSE2-NEXT: cmovol %ecx, %r9d -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: pextrw $2, %xmm0, %r10d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r10d, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r10w -; SSE2-NEXT: cmovol %ecx, %r10d -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: pextrw $3, %xmm0, %r11d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r11d, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r11w -; SSE2-NEXT: cmovol %ecx, %r11d -; SSE2-NEXT: pextrw $4, %xmm1, %ecx -; SSE2-NEXT: pextrw $4, %xmm0, %r14d -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %r14d, %edi -; SSE2-NEXT: addw %cx, %di -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE2-NEXT: addw %cx, %r14w -; SSE2-NEXT: cmovol %esi, %r14d -; SSE2-NEXT: pextrw $5, %xmm1, %esi -; SSE2-NEXT: pextrw $5, %xmm0, %ecx -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %ecx, %ebx -; SSE2-NEXT: addw %si, %bx -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE2-NEXT: addw %si, %cx -; SSE2-NEXT: cmovol %edi, %ecx -; SSE2-NEXT: pextrw $6, %xmm1, %edi -; SSE2-NEXT: pextrw $6, %xmm0, %esi -; SSE2-NEXT: xorl %ebx, %ebx -; SSE2-NEXT: movl %esi, %ebp -; SSE2-NEXT: addw %di, %bp -; SSE2-NEXT: setns %bl -; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSE2-NEXT: addw %di, %si -; SSE2-NEXT: cmovol %ebx, %esi -; SSE2-NEXT: pextrw $7, %xmm1, %edi -; SSE2-NEXT: pextrw $7, %xmm0, %ebx -; SSE2-NEXT: xorl %ebp, %ebp -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addw %di, %ax -; SSE2-NEXT: setns %bpl -; SSE2-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSE2-NEXT: addw %di, %bx -; SSE2-NEXT: cmovol %ebp, %ebx -; SSE2-NEXT: movd %ebx, %xmm0 -; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movd %r14d, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %r11d, %xmm0 -; SSE2-NEXT: movd %r10d, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %r9d, %xmm0 -; SSE2-NEXT: movd %r8d, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: psrlw $8, %xmm3 -; SSE2-NEXT: packuswb %xmm0, %xmm3 -; SSE2-NEXT: movq %xmm3, (%rdx) -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: movd %xmm0, %r8d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r8d, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r8w -; SSSE3-NEXT: cmovol %ecx, %r8d -; SSSE3-NEXT: pextrw $1, %xmm1, %eax -; SSSE3-NEXT: pextrw $1, %xmm0, %r9d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r9d, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r9w -; SSSE3-NEXT: cmovol %ecx, %r9d -; SSSE3-NEXT: pextrw $2, %xmm1, %eax -; SSSE3-NEXT: pextrw $2, %xmm0, %r10d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r10d, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r10w -; SSSE3-NEXT: cmovol %ecx, %r10d -; SSSE3-NEXT: pextrw $3, %xmm1, %eax -; SSSE3-NEXT: pextrw $3, %xmm0, %r11d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r11d, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r11w -; SSSE3-NEXT: cmovol %ecx, %r11d -; SSSE3-NEXT: pextrw $4, %xmm1, %ecx -; SSSE3-NEXT: pextrw $4, %xmm0, %r14d -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %r14d, %edi -; SSSE3-NEXT: addw %cx, %di -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSSE3-NEXT: addw %cx, %r14w -; SSSE3-NEXT: cmovol %esi, %r14d -; SSSE3-NEXT: pextrw $5, %xmm1, %esi -; SSSE3-NEXT: pextrw $5, %xmm0, %ecx -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %ecx, %ebx -; SSSE3-NEXT: addw %si, %bx -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSSE3-NEXT: addw %si, %cx -; SSSE3-NEXT: cmovol %edi, %ecx -; SSSE3-NEXT: pextrw $6, %xmm1, %edi -; SSSE3-NEXT: pextrw $6, %xmm0, %esi -; SSSE3-NEXT: xorl %ebx, %ebx -; SSSE3-NEXT: movl %esi, %ebp -; SSSE3-NEXT: addw %di, %bp -; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSSE3-NEXT: addw %di, %si -; SSSE3-NEXT: cmovol %ebx, %esi -; SSSE3-NEXT: pextrw $7, %xmm1, %edi -; SSSE3-NEXT: pextrw $7, %xmm0, %ebx -; SSSE3-NEXT: xorl %ebp, %ebp -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addw %di, %ax -; SSSE3-NEXT: setns %bpl -; SSSE3-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSSE3-NEXT: addw 
%di, %bx -; SSSE3-NEXT: cmovol %ebp, %ebx -; SSSE3-NEXT: movd %ebx, %xmm0 -; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movd %r14d, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %r11d, %xmm0 -; SSSE3-NEXT: movd %r10d, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movd %r9d, %xmm0 -; SSSE3-NEXT: movd %r8d, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: packuswb %xmm0, %xmm3 -; SSSE3-NEXT: movq %xmm3, (%rdx) -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE41-NEXT: pextrw $7, %xmm1, %eax -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE41-NEXT: pextrw $7, %xmm0, %r8d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r8d, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r8w -; SSE41-NEXT: cmovol %ecx, %r8d -; SSE41-NEXT: pextrw $6, %xmm1, %eax -; SSE41-NEXT: pextrw $6, %xmm0, %r9d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r9d, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r9w -; SSE41-NEXT: cmovol %ecx, %r9d -; SSE41-NEXT: pextrw $5, %xmm1, %eax -; SSE41-NEXT: pextrw $5, %xmm0, %r10d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r10d, %edi -; SSE41-NEXT: addw %ax, %di -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r10w -; SSE41-NEXT: cmovol %ecx, %r10d -; SSE41-NEXT: pextrw $4, %xmm1, %eax -; SSE41-NEXT: pextrw $4, %xmm0, %r11d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r11d, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r11w -; SSE41-NEXT: cmovol %ecx, %r11d -; SSE41-NEXT: pextrw $3, %xmm1, %ecx -; SSE41-NEXT: pextrw $3, %xmm0, %r14d -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %r14d, %edi -; SSE41-NEXT: addw %cx, %di -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE41-NEXT: addw %cx, %r14w -; SSE41-NEXT: cmovol %esi, %r14d -; SSE41-NEXT: pextrw $2, %xmm1, %esi -; SSE41-NEXT: pextrw $2, %xmm0, %ecx -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %ecx, %ebx -; SSE41-NEXT: addw %si, %bx -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE41-NEXT: addw %si, %cx -; 
SSE41-NEXT: cmovol %edi, %ecx -; SSE41-NEXT: movd %xmm1, %esi -; SSE41-NEXT: movd %xmm0, %edi -; SSE41-NEXT: xorl %ebx, %ebx -; SSE41-NEXT: movl %edi, %ebp -; SSE41-NEXT: addw %si, %bp -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSE41-NEXT: addw %si, %di -; SSE41-NEXT: cmovol %ebx, %edi -; SSE41-NEXT: pextrw $1, %xmm1, %esi -; SSE41-NEXT: pextrw $1, %xmm0, %ebx -; SSE41-NEXT: xorl %ebp, %ebp -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: addw %si, %ax -; SSE41-NEXT: setns %bpl -; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSE41-NEXT: addw %si, %bx -; SSE41-NEXT: cmovol %ebp, %ebx -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pinsrw $1, %ebx, %xmm0 -; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 -; SSE41-NEXT: pinsrw $3, %r14d, %xmm0 -; SSE41-NEXT: pinsrw $4, %r11d, %xmm0 -; SSE41-NEXT: pinsrw $5, %r10d, %xmm0 -; SSE41-NEXT: pinsrw $6, %r9d, %xmm0 -; SSE41-NEXT: pinsrw $7, %r8d, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm0 -; SSE41-NEXT: movq %xmm0, (%rdx) -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; SSE-LABEL: v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: paddsb %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-NEXT: vpextrw $7, %xmm1, %r8d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r8d, %esi -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r8w -; AVX1-NEXT: cmovol %ecx, %r8d -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: vpextrw $6, %xmm1, %r9d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r9d, %esi -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r9w -; AVX1-NEXT: cmovol %ecx, %r9d -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: vpextrw $5, %xmm1, %r10d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r10d, %edi -; AVX1-NEXT: addw %ax, %di -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r10w -; AVX1-NEXT: cmovol %ecx, %r10d -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: vpextrw $4, %xmm1, %r11d -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %r11d, %esi -; AVX1-NEXT: addw %ax, %si -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX1-NEXT: addw %ax, %r11w -; AVX1-NEXT: cmovol %ecx, %r11d -; AVX1-NEXT: vpextrw $3, %xmm0, %ecx -; AVX1-NEXT: vpextrw $3, %xmm1, %r14d -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: movl %r14d, %edi -; AVX1-NEXT: addw %cx, %di -; AVX1-NEXT: setns %sil -; AVX1-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX1-NEXT: addw %cx, %r14w -; AVX1-NEXT: cmovol %esi, %r14d -; AVX1-NEXT: vpextrw $2, %xmm0, %esi -; AVX1-NEXT: vpextrw $2, %xmm1, %ecx -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: movl %ecx, %ebx -; 
AVX1-NEXT: addw %si, %bx -; AVX1-NEXT: setns %dil -; AVX1-NEXT: addl $32767, %edi # imm = 0x7FFF -; AVX1-NEXT: addw %si, %cx -; AVX1-NEXT: cmovol %edi, %ecx -; AVX1-NEXT: vmovd %xmm0, %esi -; AVX1-NEXT: vmovd %xmm1, %edi -; AVX1-NEXT: xorl %ebx, %ebx -; AVX1-NEXT: movl %edi, %ebp -; AVX1-NEXT: addw %si, %bp -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addl $32767, %ebx # imm = 0x7FFF -; AVX1-NEXT: addw %si, %di -; AVX1-NEXT: cmovol %ebx, %edi -; AVX1-NEXT: vpextrw $1, %xmm0, %esi -; AVX1-NEXT: vpextrw $1, %xmm1, %ebx -; AVX1-NEXT: xorl %ebp, %ebp -; AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: addw %si, %ax -; AVX1-NEXT: setns %bpl -; AVX1-NEXT: addl $32767, %ebp # imm = 0x7FFF -; AVX1-NEXT: addw %si, %bx -; AVX1-NEXT: cmovol %ebp, %ebx -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdx) -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-NEXT: vpextrw $7, %xmm1, %r8d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r8d, %esi -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r8w -; AVX2-NEXT: cmovol %ecx, %r8d -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: vpextrw $6, %xmm1, %r9d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r9d, %esi -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r9w -; AVX2-NEXT: cmovol %ecx, %r9d -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: vpextrw $5, %xmm1, %r10d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r10d, %edi -; AVX2-NEXT: addw %ax, %di -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r10w -; AVX2-NEXT: cmovol %ecx, %r10d -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: vpextrw $4, %xmm1, %r11d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r11d, %esi -; AVX2-NEXT: addw %ax, %si -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: addw %ax, %r11w -; AVX2-NEXT: cmovol %ecx, %r11d -; AVX2-NEXT: vpextrw $3, %xmm0, %ecx -; AVX2-NEXT: vpextrw $3, %xmm1, %r14d -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: movl %r14d, %edi -; AVX2-NEXT: addw %cx, %di -; AVX2-NEXT: setns %sil -; AVX2-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX2-NEXT: addw %cx, %r14w -; AVX2-NEXT: cmovol %esi, %r14d -; AVX2-NEXT: vpextrw $2, %xmm0, %esi -; AVX2-NEXT: vpextrw $2, %xmm1, %ecx -; AVX2-NEXT: 
xorl %edi, %edi -; AVX2-NEXT: movl %ecx, %ebx -; AVX2-NEXT: addw %si, %bx -; AVX2-NEXT: setns %dil -; AVX2-NEXT: addl $32767, %edi # imm = 0x7FFF -; AVX2-NEXT: addw %si, %cx -; AVX2-NEXT: cmovol %edi, %ecx -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: vmovd %xmm1, %edi -; AVX2-NEXT: xorl %ebx, %ebx -; AVX2-NEXT: movl %edi, %ebp -; AVX2-NEXT: addw %si, %bp -; AVX2-NEXT: setns %bl -; AVX2-NEXT: addl $32767, %ebx # imm = 0x7FFF -; AVX2-NEXT: addw %si, %di -; AVX2-NEXT: cmovol %ebx, %edi -; AVX2-NEXT: vpextrw $1, %xmm0, %esi -; AVX2-NEXT: vpextrw $1, %xmm1, %ebx -; AVX2-NEXT: xorl %ebp, %ebp -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: addw %si, %ax -; AVX2-NEXT: setns %bpl -; AVX2-NEXT: addl $32767, %ebp # imm = 0x7FFF -; AVX2-NEXT: addw %si, %bx -; AVX2-NEXT: cmovol %ebp, %ebx -; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: retq -; -; AVX512-LABEL: v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-NEXT: vpextrw $7, %xmm0, %eax -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vpextrw $7, %xmm1, %r8d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r8d, %esi -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r8w -; AVX512-NEXT: cmovol %ecx, %r8d -; AVX512-NEXT: vpextrw $6, %xmm0, %eax -; AVX512-NEXT: vpextrw $6, %xmm1, %r9d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r9d, %esi -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r9w -; AVX512-NEXT: cmovol %ecx, %r9d -; AVX512-NEXT: vpextrw $5, %xmm0, %eax -; AVX512-NEXT: vpextrw $5, %xmm1, %r10d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r10d, %edi -; AVX512-NEXT: addw %ax, %di -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r10w -; AVX512-NEXT: cmovol %ecx, %r10d -; AVX512-NEXT: vpextrw $4, %xmm0, %eax -; AVX512-NEXT: vpextrw $4, %xmm1, %r11d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r11d, %esi -; AVX512-NEXT: addw %ax, %si -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: addw %ax, %r11w -; AVX512-NEXT: cmovol %ecx, %r11d -; AVX512-NEXT: vpextrw $3, %xmm0, %ecx -; AVX512-NEXT: vpextrw $3, %xmm1, %r14d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: movl %r14d, %edi -; AVX512-NEXT: addw %cx, %di -; AVX512-NEXT: setns %sil -; AVX512-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX512-NEXT: addw %cx, %r14w -; AVX512-NEXT: cmovol %esi, %r14d -; AVX512-NEXT: 
vpextrw $2, %xmm0, %esi -; AVX512-NEXT: vpextrw $2, %xmm1, %ecx -; AVX512-NEXT: xorl %edi, %edi -; AVX512-NEXT: movl %ecx, %ebx -; AVX512-NEXT: addw %si, %bx -; AVX512-NEXT: setns %dil -; AVX512-NEXT: addl $32767, %edi # imm = 0x7FFF -; AVX512-NEXT: addw %si, %cx -; AVX512-NEXT: cmovol %edi, %ecx -; AVX512-NEXT: vmovd %xmm0, %esi -; AVX512-NEXT: vmovd %xmm1, %edi -; AVX512-NEXT: xorl %ebx, %ebx -; AVX512-NEXT: movl %edi, %ebp -; AVX512-NEXT: addw %si, %bp -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addl $32767, %ebx # imm = 0x7FFF -; AVX512-NEXT: addw %si, %di -; AVX512-NEXT: cmovol %ebx, %edi -; AVX512-NEXT: vpextrw $1, %xmm0, %esi -; AVX512-NEXT: vpextrw $1, %xmm1, %ebx -; AVX512-NEXT: xorl %ebp, %ebp -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: addw %si, %ax -; AVX512-NEXT: setns %bpl -; AVX512-NEXT: addl $32767, %ebp # imm = 0x7FFF -; AVX512-NEXT: addw %si, %bx -; AVX512-NEXT: cmovol %ebp, %ebx -; AVX512-NEXT: vmovd %edi, %xmm0 -; AVX512-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpmovwb %xmm0, (%rdx) -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq - %x = load <8 x i8>, <8 x i8>* %px - %y = load <8 x i8>, <8 x i8>* %py - %z = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y) - store <8 x i8> %z, <8 x i8>* %pz - ret void -} - -define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { -; SSE2-LABEL: v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pslld $24, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm2, %r8d -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %r8d, %edi -; SSE2-NEXT: addl %ecx, %edi -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE2-NEXT: addl %ecx, %r8d -; SSE2-NEXT: cmovol %esi, %r8d -; SSE2-NEXT: movd %xmm1, %esi -; SSE2-NEXT: movd %xmm0, %r10d -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %r10d, %ecx -; SSE2-NEXT: addl %esi, %ecx -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE2-NEXT: addl %esi, %r10d -; SSE2-NEXT: cmovol %edi, %r10d -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %r9d -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %ecx, %esi -; SSE2-NEXT: addl %r9d, %esi -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE2-NEXT: addl %r9d, %ecx -; SSE2-NEXT: cmovol %edi, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %r9d -; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: addl %r9d, %edi -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE2-NEXT: addl %r9d, %eax -; SSE2-NEXT: cmovol %esi, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %r10d, %xmm0 -; SSE2-NEXT: movd %r8d, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrld $24, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v4i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSSE3-NEXT: movd %xmm2, %r8d -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %r8d, %edi -; SSSE3-NEXT: addl %ecx, %edi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSSE3-NEXT: addl %ecx, %r8d -; SSSE3-NEXT: cmovol %esi, %r8d -; SSSE3-NEXT: movd %xmm1, %esi -; SSSE3-NEXT: movd %xmm0, %r10d -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %r10d, %ecx -; SSSE3-NEXT: addl %esi, %ecx -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSSE3-NEXT: addl %esi, %r10d -; SSSE3-NEXT: cmovol %edi, %r10d -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %r9d -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: addl %r9d, %esi -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSSE3-NEXT: addl %r9d, %ecx -; SSSE3-NEXT: cmovol %edi, %ecx -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSSE3-NEXT: movd %xmm1, %r9d -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: addl %r9d, %edi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSSE3-NEXT: addl %r9d, %eax -; SSSE3-NEXT: cmovol %esi, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %r10d, %xmm0 -; SSSE3-NEXT: movd %r8d, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm0, (%rdx) -; SSSE3-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: retq ; -; SSE41-LABEL: v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pslld $24, %xmm1 -; SSE41-NEXT: pextrd $3, %xmm1, %ecx -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: pextrd $3, %xmm0, %r8d -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %r8d, %edi -; SSE41-NEXT: addl %ecx, %edi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE41-NEXT: addl %ecx, %r8d -; SSE41-NEXT: cmovol %esi, %r8d -; SSE41-NEXT: pextrd $2, %xmm1, %esi -; SSE41-NEXT: pextrd $2, %xmm0, %r10d -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %r10d, %ecx -; SSE41-NEXT: addl %esi, %ecx -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE41-NEXT: addl %esi, %r10d -; SSE41-NEXT: cmovol %edi, %r10d -; SSE41-NEXT: movd %xmm1, %r9d -; SSE41-NEXT: movd %xmm0, %ecx -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %ecx, %esi -; SSE41-NEXT: addl %r9d, %esi -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE41-NEXT: addl %r9d, %ecx -; SSE41-NEXT: cmovol %edi, %ecx -; SSE41-NEXT: pextrd $1, %xmm1, %r9d -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: addl %r9d, %edi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE41-NEXT: addl %r9d, %eax -; SSE41-NEXT: cmovol %esi, %eax -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: pinsrd $2, %r10d, %xmm0 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movd %xmm0, (%rdx) -; SSE41-NEXT: retq +; AVX512-LABEL: v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovwb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <8 x i8>, <8 x i8>* %px + %y = load <8 x i8>, <8 x i8>* %py + %z = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y) + store <8 x i8> %z, <8 x i8>* %pz + ret void +} + +define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { +; SSE-LABEL: v4i8: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: paddsb %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %ecx -; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpextrd $3, %xmm1, %r9d -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: movl %r9d, %edi -; AVX1-NEXT: addl %ecx, %edi -; AVX1-NEXT: setns %sil -; AVX1-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX1-NEXT: addl %ecx, %r9d -; AVX1-NEXT: cmovol %esi, %r9d -; AVX1-NEXT: vpextrd $2, %xmm0, %r8d -; AVX1-NEXT: vpextrd $2, %xmm1, %r10d -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: movl %r10d, %ecx -; AVX1-NEXT: addl %r8d, %ecx -; AVX1-NEXT: setns %dil -; AVX1-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX1-NEXT: addl %r8d, %r10d -; AVX1-NEXT: 
cmovol %edi, %r10d -; AVX1-NEXT: vmovd %xmm0, %r8d -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: addl %r8d, %edi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX1-NEXT: addl %r8d, %eax -; AVX1-NEXT: cmovol %ecx, %eax -; AVX1-NEXT: vpextrd $1, %xmm0, %r8d -; AVX1-NEXT: vpextrd $1, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edi -; AVX1-NEXT: addl %r8d, %edi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX1-NEXT: addl %r8d, %esi -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %ecx -; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX2-NEXT: vpextrd $3, %xmm1, %r9d -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: movl %r9d, %edi -; AVX2-NEXT: addl %ecx, %edi -; AVX2-NEXT: setns %sil -; AVX2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX2-NEXT: addl %ecx, %r9d -; AVX2-NEXT: cmovol %esi, %r9d -; AVX2-NEXT: vpextrd $2, %xmm0, %r8d -; AVX2-NEXT: vpextrd $2, %xmm1, %r10d -; AVX2-NEXT: xorl %edi, %edi -; AVX2-NEXT: movl %r10d, %ecx -; AVX2-NEXT: addl %r8d, %ecx -; AVX2-NEXT: setns %dil -; AVX2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX2-NEXT: addl %r8d, %r10d -; AVX2-NEXT: cmovol %edi, %r10d -; AVX2-NEXT: vmovd %xmm0, %r8d -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: addl %r8d, %edi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX2-NEXT: addl %r8d, %eax -; AVX2-NEXT: cmovol %ecx, %eax -; AVX2-NEXT: vpextrd $1, %xmm0, %r8d -; AVX2-NEXT: vpextrd $1, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edi -; AVX2-NEXT: addl %r8d, %edi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX2-NEXT: addl %r8d, %esi -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdx) ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrd $3, %xmm0, %ecx -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrd $3, %xmm1, 
%r9d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: movl %r9d, %edi -; AVX512-NEXT: addl %ecx, %edi -; AVX512-NEXT: setns %sil -; AVX512-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX512-NEXT: addl %ecx, %r9d -; AVX512-NEXT: cmovol %esi, %r9d -; AVX512-NEXT: vpextrd $2, %xmm0, %r8d -; AVX512-NEXT: vpextrd $2, %xmm1, %r10d -; AVX512-NEXT: xorl %edi, %edi -; AVX512-NEXT: movl %r10d, %ecx -; AVX512-NEXT: addl %r8d, %ecx -; AVX512-NEXT: setns %dil -; AVX512-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX512-NEXT: addl %r8d, %r10d -; AVX512-NEXT: cmovol %edi, %r10d -; AVX512-NEXT: vmovd %xmm0, %r8d -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: addl %r8d, %edi -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX512-NEXT: addl %r8d, %eax -; AVX512-NEXT: cmovol %ecx, %eax -; AVX512-NEXT: vpextrd $1, %xmm0, %r8d -; AVX512-NEXT: vpextrd $1, %xmm1, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edi -; AVX512-NEXT: addl %r8d, %edi -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX512-NEXT: addl %r8d, %esi -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vpmovdb %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <4 x i8>, <4 x i8>* %px @@ -14437,45 +280,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE2-NEXT: movzwl (%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE2-NEXT: psllq $56, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: psllq $56, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movq %rcx, %rdi -; SSE2-NEXT: addq %rax, %rdi -; SSE2-NEXT: setns %sil -; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; SSE2-NEXT: addq %r8, %rsi -; SSE2-NEXT: addq %rax, %rcx -; SSE2-NEXT: cmovoq %rsi, %rcx -; SSE2-NEXT: movq %xmm1, %r9 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: addq %r9, %rsi -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addq %r8, %rdi -; SSE2-NEXT: addq %r9, %rax -; SSE2-NEXT: cmovoq %rdi, %rax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrlq $56, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: paddsb %xmm0, %xmm1 +; 
SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: movw %ax, (%rdx) ; SSE2-NEXT: retq ; @@ -14485,131 +293,38 @@ ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: movzwl (%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm3, %rax -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movq %xmm2, %rcx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movq %rcx, %rdi -; SSSE3-NEXT: addq %rax, %rdi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; SSSE3-NEXT: addq %r8, %rsi -; SSSE3-NEXT: addq %rax, %rcx -; SSSE3-NEXT: cmovoq %rsi, %rcx -; SSSE3-NEXT: movq %xmm1, %r9 -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: addq %r9, %rsi -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addq %r8, %rdi -; SSSE3-NEXT: addq %r9, %rax -; SSSE3-NEXT: cmovoq %rdi, %rax -; SSSE3-NEXT: movq %rax, %xmm0 -; SSSE3-NEXT: movq %rcx, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: paddsb %xmm0, %xmm1 +; SSSE3-NEXT: movd %xmm1, %eax ; SSSE3-NEXT: movw %ax, (%rdx) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psllq $56, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: psllq $56, %xmm0 -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movq %rcx, %rdi -; SSE41-NEXT: addq %rax, %rdi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; SSE41-NEXT: addq %r8, %rsi -; SSE41-NEXT: addq %rax, %rcx -; SSE41-NEXT: cmovoq %rsi, %rcx -; SSE41-NEXT: pextrq $1, %xmm1, %r9 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movq %rax, %rsi -; SSE41-NEXT: addq %r9, %rsi -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addq %r8, %rdi -; SSE41-NEXT: addq %r9, %rax -; SSE41-NEXT: cmovoq %rdi, %rax -; SSE41-NEXT: movq %rax, %xmm0 -; SSE41-NEXT: movq %rcx, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movzwl (%rsi), %eax +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: paddsb %xmm0, %xmm1 ; SSE41-NEXT: pextrw $0, %xmm1, (%rdx) ; SSE41-NEXT: retq ; ; AVX1-LABEL: v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: addq %rax, %rdi -; AVX1-NEXT: setns %sil -; AVX1-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; AVX1-NEXT: addq %r8, %rsi -; AVX1-NEXT: addq %rax, %rcx -; 
AVX1-NEXT: cmovoq %rsi, %rcx -; AVX1-NEXT: vpextrq $1, %xmm1, %r9 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: addq %r9, %rsi -; AVX1-NEXT: setns %dil -; AVX1-NEXT: addq %r8, %rdi -; AVX1-NEXT: addq %r9, %rax -; AVX1-NEXT: cmovoq %rdi, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: movzwl (%rsi), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: movq %rcx, %rdi -; AVX2-NEXT: addq %rax, %rdi -; AVX2-NEXT: setns %sil -; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; AVX2-NEXT: addq %r8, %rsi -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: cmovoq %rsi, %rcx -; AVX2-NEXT: vpextrq $1, %xmm1, %r9 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: xorl %edi, %edi -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: addq %r9, %rsi -; AVX2-NEXT: setns %dil -; AVX2-NEXT: addq %r8, %rdi -; AVX2-NEXT: addq %r9, %rax -; AVX2-NEXT: cmovoq %rdi, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: movzwl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: movzwl (%rsi), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -14619,32 +334,8 @@ ; AVX512-NEXT: vmovd %eax, %xmm0 ; AVX512-NEXT: movzwl (%rsi), %eax ; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: movq %rcx, %rdi -; AVX512-NEXT: addq %rax, %rdi -; AVX512-NEXT: setns %sil -; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; AVX512-NEXT: addq %r8, %rsi -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: cmovoq %rsi, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %r9 -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: xorl %edi, %edi -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: addq %r9, %rsi -; AVX512-NEXT: setns %dil -; AVX512-NEXT: addq %r8, %rdi -; AVX512-NEXT: addq %r9, %rax -; AVX512-NEXT: cmovoq %rdi, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vmovq %rcx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vpsrlq $56, %xmm0, %xmm0 +; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpmovqb %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <2 x i8>, 
<2 x i8>* %px @@ -14655,336 +346,36 @@ } define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { -; SSE2-LABEL: v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm2, %r8d -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %r8d, %edi -; SSE2-NEXT: addl %ecx, %edi -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE2-NEXT: addl %ecx, %r8d -; SSE2-NEXT: cmovol %esi, %r8d -; SSE2-NEXT: movd %xmm1, %esi -; SSE2-NEXT: movd %xmm0, %r10d -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %r10d, %ecx -; SSE2-NEXT: addl %esi, %ecx -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE2-NEXT: addl %esi, %r10d -; SSE2-NEXT: cmovol %edi, %r10d -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %r9d -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %ecx, %esi -; SSE2-NEXT: addl %r9d, %esi -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE2-NEXT: addl %r9d, %ecx -; SSE2-NEXT: cmovol %edi, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %r9d -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: addl %r9d, %edi -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE2-NEXT: addl %r9d, %eax -; SSE2-NEXT: cmovol %esi, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %r10d, %xmm0 -; SSE2-NEXT: movd %r8d, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v4i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSSE3-NEXT: movd %xmm2, %r8d -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %r8d, %edi -; SSSE3-NEXT: addl %ecx, %edi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSSE3-NEXT: addl %ecx, %r8d -; SSSE3-NEXT: cmovol %esi, %r8d -; SSSE3-NEXT: movd %xmm1, %esi -; SSSE3-NEXT: movd %xmm0, %r10d -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %r10d, %ecx -; 
SSSE3-NEXT: addl %esi, %ecx -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSSE3-NEXT: addl %esi, %r10d -; SSSE3-NEXT: cmovol %edi, %r10d -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %r9d -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: addl %r9d, %esi -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSSE3-NEXT: addl %r9d, %ecx -; SSSE3-NEXT: cmovol %edi, %ecx -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSSE3-NEXT: movd %xmm1, %r9d -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: addl %r9d, %edi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSSE3-NEXT: addl %r9d, %eax -; SSSE3-NEXT: cmovol %esi, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %r10d, %xmm0 -; SSSE3-NEXT: movd %r8d, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,10,11,14,15,14,15],zero,zero -; SSSE3-NEXT: movq %xmm0, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE41-NEXT: pextrd $3, %xmm1, %ecx -; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE41-NEXT: pextrd $3, %xmm0, %r8d -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %r8d, %edi -; SSE41-NEXT: addl %ecx, %edi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE41-NEXT: addl %ecx, %r8d -; SSE41-NEXT: cmovol %esi, %r8d -; SSE41-NEXT: pextrd $2, %xmm1, %esi -; SSE41-NEXT: pextrd $2, %xmm0, %r10d -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %r10d, %ecx -; SSE41-NEXT: addl %esi, %ecx -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE41-NEXT: addl %esi, %r10d -; SSE41-NEXT: cmovol %edi, %r10d -; SSE41-NEXT: movd %xmm1, %r9d -; SSE41-NEXT: movd %xmm0, %ecx -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %ecx, %esi -; SSE41-NEXT: addl %r9d, %esi -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE41-NEXT: addl %r9d, %ecx -; SSE41-NEXT: cmovol %edi, %ecx -; SSE41-NEXT: pextrd $1, %xmm1, %r9d -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: addl %r9d, %edi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE41-NEXT: addl %r9d, %eax -; SSE41-NEXT: cmovol %esi, %eax -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: pinsrd $2, %r10d, %xmm0 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: packusdw %xmm0, %xmm0 -; SSE41-NEXT: movq %xmm0, (%rdx) -; SSE41-NEXT: retq +; SSE-LABEL: v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; 
SSE-NEXT: paddsw %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-NEXT: vpextrd $3, %xmm0, %ecx -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpextrd $3, %xmm1, %r9d -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: movl %r9d, %edi -; AVX1-NEXT: addl %ecx, %edi -; AVX1-NEXT: setns %sil -; AVX1-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX1-NEXT: addl %ecx, %r9d -; AVX1-NEXT: cmovol %esi, %r9d -; AVX1-NEXT: vpextrd $2, %xmm0, %r8d -; AVX1-NEXT: vpextrd $2, %xmm1, %r10d -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: movl %r10d, %ecx -; AVX1-NEXT: addl %r8d, %ecx -; AVX1-NEXT: setns %dil -; AVX1-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX1-NEXT: addl %r8d, %r10d -; AVX1-NEXT: cmovol %edi, %r10d -; AVX1-NEXT: vmovd %xmm0, %r8d -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: addl %r8d, %edi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX1-NEXT: addl %r8d, %eax -; AVX1-NEXT: cmovol %ecx, %eax -; AVX1-NEXT: vpextrd $1, %xmm0, %r8d -; AVX1-NEXT: vpextrd $1, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edi -; AVX1-NEXT: addl %r8d, %edi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX1-NEXT: addl %r8d, %esi -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-NEXT: vpextrd $3, %xmm0, %ecx -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpextrd $3, %xmm1, %r9d -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: movl %r9d, %edi -; AVX2-NEXT: addl %ecx, %edi -; AVX2-NEXT: setns %sil -; AVX2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX2-NEXT: addl %ecx, %r9d -; AVX2-NEXT: cmovol %esi, %r9d -; AVX2-NEXT: vpextrd $2, %xmm0, %r8d -; AVX2-NEXT: vpextrd $2, %xmm1, %r10d -; AVX2-NEXT: xorl %edi, %edi -; AVX2-NEXT: movl %r10d, %ecx -; AVX2-NEXT: addl %r8d, %ecx -; AVX2-NEXT: setns %dil -; AVX2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX2-NEXT: addl %r8d, %r10d -; AVX2-NEXT: cmovol %edi, %r10d -; AVX2-NEXT: vmovd %xmm0, %r8d -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: addl %r8d, %edi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX2-NEXT: addl %r8d, %eax -; AVX2-NEXT: cmovol %ecx, %eax -; AVX2-NEXT: vpextrd $1, %xmm0, %r8d -; AVX2-NEXT: vpextrd $1, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edi -; AVX2-NEXT: addl %r8d, %edi -; 
AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX2-NEXT: addl %r8d, %esi -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdx) ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,1,255,255,2,3,255,255,4,5,255,255,6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrd $3, %xmm0, %ecx -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrd $3, %xmm1, %r9d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: movl %r9d, %edi -; AVX512-NEXT: addl %ecx, %edi -; AVX512-NEXT: setns %sil -; AVX512-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX512-NEXT: addl %ecx, %r9d -; AVX512-NEXT: cmovol %esi, %r9d -; AVX512-NEXT: vpextrd $2, %xmm0, %r8d -; AVX512-NEXT: vpextrd $2, %xmm1, %r10d -; AVX512-NEXT: xorl %edi, %edi -; AVX512-NEXT: movl %r10d, %ecx -; AVX512-NEXT: addl %r8d, %ecx -; AVX512-NEXT: setns %dil -; AVX512-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX512-NEXT: addl %r8d, %r10d -; AVX512-NEXT: cmovol %edi, %r10d -; AVX512-NEXT: vmovd %xmm0, %r8d -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: addl %r8d, %edi -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX512-NEXT: addl %r8d, %eax -; AVX512-NEXT: cmovol %ecx, %eax -; AVX512-NEXT: vpextrd $1, %xmm0, %r8d -; AVX512-NEXT: vpextrd $1, %xmm1, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edi -; AVX512-NEXT: addl %r8d, %edi -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX512-NEXT: addl %r8d, %esi -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512-NEXT: vpmovdw %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <4 x i16>, <4 x i16>* %px @@ -14992,177 +383,30 @@ %z = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %x, <4 x i16> %y) store <4 x i16> %z, <4 x i16>* %pz ret void -} - -define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { -; SSE2-LABEL: v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; SSE2-NEXT: psllq $48, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movq %rcx, %rdi -; SSE2-NEXT: addq 
%rax, %rdi -; SSE2-NEXT: setns %sil -; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; SSE2-NEXT: addq %r8, %rsi -; SSE2-NEXT: addq %rax, %rcx -; SSE2-NEXT: cmovoq %rsi, %rcx -; SSE2-NEXT: movq %xmm1, %r9 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: addq %r9, %rsi -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addq %r8, %rdi -; SSE2-NEXT: addq %r9, %rax -; SSE2-NEXT: cmovoq %rdi, %rax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrlq $48, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movd %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v2i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm3, %rax -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movq %xmm2, %rcx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movq %rcx, %rdi -; SSSE3-NEXT: addq %rax, %rdi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; SSSE3-NEXT: addq %r8, %rsi -; SSSE3-NEXT: addq %rax, %rcx -; SSSE3-NEXT: cmovoq %rsi, %rcx -; SSSE3-NEXT: movq %xmm1, %r9 -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: addq %r9, %rsi -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addq %r8, %rdi -; SSSE3-NEXT: addq %r9, %rax -; SSSE3-NEXT: cmovoq %rdi, %rax -; SSSE3-NEXT: movq %rax, %xmm0 -; SSSE3-NEXT: movq %rcx, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movd %xmm0, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: psllq $48, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movq %rcx, %rdi -; SSE41-NEXT: addq %rax, %rdi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; SSE41-NEXT: addq %r8, %rsi -; SSE41-NEXT: addq %rax, %rcx -; SSE41-NEXT: cmovoq %rsi, %rcx -; SSE41-NEXT: pextrq $1, %xmm1, %r9 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movq %rax, %rsi -; SSE41-NEXT: addq %r9, %rsi -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addq %r8, %rdi -; SSE41-NEXT: addq %r9, %rax -; SSE41-NEXT: cmovoq %rdi, %rax -; SSE41-NEXT: movq %rax, %xmm0 -; SSE41-NEXT: movq %rcx, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,14,15],zero,zero,xmm1[14,15],zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movd %xmm1, (%rdx) -; SSE41-NEXT: retq +} + +define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { +; SSE-LABEL: v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero +; SSE-NEXT: paddsw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: addq %rax, %rdi -; AVX1-NEXT: setns %sil -; AVX1-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; AVX1-NEXT: addq %r8, %rsi -; AVX1-NEXT: addq %rax, %rcx -; AVX1-NEXT: cmovoq %rsi, %rcx -; AVX1-NEXT: vpextrq $1, %xmm1, %r9 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: addq %r9, %rsi -; AVX1-NEXT: setns %dil -; AVX1-NEXT: addq %r8, %rdi -; AVX1-NEXT: addq %r9, %rax -; AVX1-NEXT: cmovoq %rdi, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: movq %rcx, %rdi -; AVX2-NEXT: addq %rax, %rdi -; AVX2-NEXT: setns %sil -; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; AVX2-NEXT: addq %r8, %rsi -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: cmovoq %rsi, %rcx -; AVX2-NEXT: vpextrq $1, %xmm1, %r9 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: xorl %edi, %edi -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: addq %r9, %rsi -; AVX2-NEXT: setns %dil -; AVX2-NEXT: addq %r8, %rdi -; AVX2-NEXT: addq %r9, %rax -; AVX2-NEXT: cmovoq %rdi, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -15170,32 +414,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: movq %rcx, %rdi -; AVX512-NEXT: addq %rax, %rdi -; AVX512-NEXT: setns %sil -; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; AVX512-NEXT: addq %r8, %rsi -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: cmovoq %rsi, %rcx -; AVX512-NEXT: vpextrq $1, 
%xmm1, %r9 -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: xorl %edi, %edi -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: addq %r9, %rsi -; AVX512-NEXT: setns %dil -; AVX512-NEXT: addq %r8, %rdi -; AVX512-NEXT: addq %r9, %rax -; AVX512-NEXT: cmovoq %rdi, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vmovq %rcx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vpmovqw %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <2 x i16>, <2 x i16>* %px @@ -15206,1586 +426,59 @@ } define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { -; SSE2-LABEL: v12i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r9b -; SSE2-NEXT: jno .LBB11_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB11_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %sil -; SSE2-NEXT: jno .LBB11_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB11_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: jno .LBB11_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB11_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB11_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB11_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r10b -; SSE2-NEXT: jno .LBB11_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB11_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r11b -; SSE2-NEXT: jno .LBB11_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB11_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bpl -; SSE2-NEXT: jno .LBB11_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB11_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r14b -; 
SSE2-NEXT: jno .LBB11_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB11_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r15b -; SSE2-NEXT: jno .LBB11_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB11_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r12b -; SSE2-NEXT: jno .LBB11_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB11_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r13b -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB11_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB11_22: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dil -; SSE2-NEXT: jno .LBB11_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB11_24: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r8b -; SSE2-NEXT: jno .LBB11_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB11_26: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: jno .LBB11_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB11_28: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: addb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addb %dl, %al -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB11_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB11_30: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %ecx -; SSE2-NEXT: addb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addb %dl, %sil -; SSE2-NEXT: jno .LBB11_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %esi -; SSE2-NEXT: .LBB11_32: -; SSE2-NEXT: movzbl %sil, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r13b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movzbl %r12b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r15b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movzbl %r14b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %bpl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl %r11b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r10b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v12i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: jno .LBB11_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB11_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %esi, 
%eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: jno .LBB11_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB11_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: jno .LBB11_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB11_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB11_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB11_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r10b -; SSSE3-NEXT: jno .LBB11_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB11_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r11b -; SSSE3-NEXT: jno .LBB11_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB11_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bpl -; SSSE3-NEXT: jno .LBB11_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB11_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r14b -; SSSE3-NEXT: jno .LBB11_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB11_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r15b -; SSSE3-NEXT: jno .LBB11_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB11_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r12b -; SSSE3-NEXT: jno .LBB11_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB11_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r13b -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB11_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB11_22: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; 
SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dil -; SSSE3-NEXT: jno .LBB11_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB11_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r8b -; SSSE3-NEXT: jno .LBB11_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB11_26: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: jno .LBB11_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB11_28: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: addb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addb %dl, %al -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB11_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB11_30: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %ecx -; SSSE3-NEXT: addb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addb %dl, %sil -; SSSE3-NEXT: jno .LBB11_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: .LBB11_32: -; SSSE3-NEXT: movzbl %sil, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r8b, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r13b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: movzbl %r12b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r15b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movzbl %r14b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %bpl, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl %r11b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl 
%r10b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v12i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $15, %xmm1, %ecx -; SSE41-NEXT: pextrb $15, %xmm0, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: jno .LBB11_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB11_2: -; SSE41-NEXT: pextrb $14, %xmm1, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r11b -; SSE41-NEXT: jno .LBB11_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB11_4: -; SSE41-NEXT: pextrb $13, %xmm1, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: jno .LBB11_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB11_6: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $12, %xmm1, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r14b -; SSE41-NEXT: jno .LBB11_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB11_8: -; SSE41-NEXT: pextrb $11, %xmm1, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: jno .LBB11_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB11_10: -; SSE41-NEXT: pextrb $10, %xmm1, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r15b -; SSE41-NEXT: jno .LBB11_12 -; 
SSE41-NEXT: # %bb.11: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB11_12: -; SSE41-NEXT: pextrb $9, %xmm1, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r12b -; SSE41-NEXT: jno .LBB11_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB11_14: -; SSE41-NEXT: pextrb $8, %xmm1, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r13b -; SSE41-NEXT: jno .LBB11_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB11_16: -; SSE41-NEXT: pextrb $7, %xmm1, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r10b -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB11_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB11_18: -; SSE41-NEXT: pextrb $6, %xmm1, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r9b -; SSE41-NEXT: jno .LBB11_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB11_20: -; SSE41-NEXT: pextrb $5, %xmm1, %ecx -; SSE41-NEXT: pextrb $5, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB11_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB11_22: -; SSE41-NEXT: pextrb $4, %xmm1, %ecx -; SSE41-NEXT: pextrb $4, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB11_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB11_24: -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: addb %dl, %cl -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addb %dl, %al -; SSE41-NEXT: jno .LBB11_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: addb $127, %cl -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB11_26: -; SSE41-NEXT: pextrb $2, %xmm1, %ebx -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: addb %bl, %dl -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addb %bl, %cl -; SSE41-NEXT: jno .LBB11_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: addb $127, %dl -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB11_28: -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: addb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addb %sil, %dl -; SSE41-NEXT: jno .LBB11_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: .LBB11_30: -; SSE41-NEXT: pextrb $1, %xmm1, %esi -; SSE41-NEXT: pextrb $1, %xmm0, %r8d -; SSE41-NEXT: movl %r8d, %ebx -; SSE41-NEXT: addb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addb %sil, %r8b -; SSE41-NEXT: jno .LBB11_32 -; SSE41-NEXT: # %bb.31: 
-; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r8d -; SSE41-NEXT: .LBB11_32: -; SSE41-NEXT: movzbl %dl, %edx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: movzbl %r8b, %edx -; SSE41-NEXT: pinsrb $1, %edx, %xmm0 -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm0 -; SSE41-NEXT: movzbl %dil, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm0 -; SSE41-NEXT: movzbl %bpl, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm0 -; SSE41-NEXT: movzbl %r9b, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm0 -; SSE41-NEXT: movzbl %r10b, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %eax, %xmm0 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: movzbl %r11b, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; SSE-LABEL: v12i8: +; SSE: # %bb.0: +; SSE-NEXT: paddsb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v12i8: ; AVX: # %bb.0: -; AVX-NEXT: vpextrb $15, %xmm1, %ecx -; AVX-NEXT: vpextrb $15, %xmm0, %edx -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %dl -; AVX-NEXT: jno .LBB11_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: .LBB11_2: -; AVX-NEXT: vpextrb $14, %xmm1, %ecx -; AVX-NEXT: vpextrb $14, %xmm0, %r11d -; AVX-NEXT: movl %r11d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r11b -; AVX-NEXT: jno .LBB11_4 -; AVX-NEXT: # %bb.3: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r11d -; AVX-NEXT: .LBB11_4: -; AVX-NEXT: vpextrb $13, %xmm1, %ecx -; AVX-NEXT: vpextrb $13, %xmm0, %edi -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %dil -; AVX-NEXT: jno .LBB11_6 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: .LBB11_6: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r15 -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 -; AVX-NEXT: pushq %r12 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vpextrb $12, %xmm1, %ecx -; AVX-NEXT: vpextrb $12, %xmm0, %r14d -; AVX-NEXT: movl %r14d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r14b -; AVX-NEXT: jno .LBB11_8 -; AVX-NEXT: # %bb.7: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r14d -; AVX-NEXT: .LBB11_8: -; AVX-NEXT: vpextrb $11, %xmm1, %ecx -; AVX-NEXT: vpextrb $11, %xmm0, %ebp -; AVX-NEXT: movl %ebp, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %bpl -; AVX-NEXT: jno .LBB11_10 -; AVX-NEXT: # %bb.9: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: .LBB11_10: -; AVX-NEXT: vpextrb $10, %xmm1, %ecx -; AVX-NEXT: vpextrb $10, %xmm0, %r15d -; AVX-NEXT: movl %r15d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r15b -; 
AVX-NEXT: jno .LBB11_12 -; AVX-NEXT: # %bb.11: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r15d -; AVX-NEXT: .LBB11_12: -; AVX-NEXT: vpextrb $9, %xmm1, %ecx -; AVX-NEXT: vpextrb $9, %xmm0, %r12d -; AVX-NEXT: movl %r12d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r12b -; AVX-NEXT: jno .LBB11_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r12d -; AVX-NEXT: .LBB11_14: -; AVX-NEXT: vpextrb $8, %xmm1, %ecx -; AVX-NEXT: vpextrb $8, %xmm0, %r13d -; AVX-NEXT: movl %r13d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r13b -; AVX-NEXT: jno .LBB11_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r13d -; AVX-NEXT: .LBB11_16: -; AVX-NEXT: vpextrb $7, %xmm1, %ecx -; AVX-NEXT: vpextrb $7, %xmm0, %r10d -; AVX-NEXT: movl %r10d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r10b -; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB11_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r10d -; AVX-NEXT: .LBB11_18: -; AVX-NEXT: vpextrb $6, %xmm1, %ecx -; AVX-NEXT: vpextrb $6, %xmm0, %r9d -; AVX-NEXT: movl %r9d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r9b -; AVX-NEXT: jno .LBB11_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r9d -; AVX-NEXT: .LBB11_20: -; AVX-NEXT: vpextrb $5, %xmm1, %ecx -; AVX-NEXT: vpextrb $5, %xmm0, %ebp -; AVX-NEXT: movl %ebp, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %bpl -; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB11_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: .LBB11_22: -; AVX-NEXT: vpextrb $4, %xmm1, %ecx -; AVX-NEXT: vpextrb $4, %xmm0, %edi -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %dil -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB11_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: .LBB11_24: -; AVX-NEXT: vpextrb $3, %xmm1, %edx -; AVX-NEXT: vpextrb $3, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: addb %dl, %cl -; AVX-NEXT: setns %cl -; AVX-NEXT: addb %dl, %al -; AVX-NEXT: jno .LBB11_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: addb $127, %cl -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_26: -; AVX-NEXT: vpextrb $2, %xmm1, %ebx -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: addb %bl, %dl -; AVX-NEXT: setns %dl -; AVX-NEXT: addb %bl, %cl -; AVX-NEXT: jno .LBB11_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: addb $127, %dl -; AVX-NEXT: movl %edx, %ecx -; AVX-NEXT: .LBB11_28: -; AVX-NEXT: vpextrb $0, %xmm1, %esi -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: movl %edx, %ebx -; AVX-NEXT: addb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: addb %sil, %dl -; AVX-NEXT: jno .LBB11_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %edx -; AVX-NEXT: .LBB11_30: -; AVX-NEXT: vpextrb $1, %xmm1, %esi -; AVX-NEXT: vpextrb $1, %xmm0, %r8d -; AVX-NEXT: movl %r8d, %ebx -; AVX-NEXT: addb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: addb %sil, %r8b -; AVX-NEXT: jno .LBB11_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %r8d -; AVX-NEXT: .LBB11_32: -; AVX-NEXT: movzbl %dl, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: movzbl %r8b, %edx -; 
AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %cl, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %dil, %eax -; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %bpl, %eax -; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r9b, %eax -; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r10b, %eax -; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r13b, %eax -; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r12b, %eax -; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r15b, %eax -; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r14b, %eax -; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r11b, %eax -; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r13 -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %r15 -; AVX-NEXT: popq %rbp +; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y) ret <12 x i8> %z } define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { -; SSE2-LABEL: v12i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: movdqa (%rsi), %xmm1 -; SSE2-NEXT: movdqa 16(%rsi), %xmm3 -; SSE2-NEXT: pextrw $3, %xmm3, %eax -; SSE2-NEXT: pextrw $3, %xmm2, %edx -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edx, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %dx -; SSE2-NEXT: cmovol %ecx, %edx -; SSE2-NEXT: pextrw $2, %xmm3, %eax -; SSE2-NEXT: pextrw $2, %xmm2, %r9d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r9d, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r9w -; SSE2-NEXT: cmovol %ecx, %r9d -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: movd %xmm2, %r10d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r10d, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r10w -; SSE2-NEXT: cmovol %ecx, %r10d -; SSE2-NEXT: pextrw $1, %xmm3, %eax -; SSE2-NEXT: pextrw $1, %xmm2, %r11d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r11d, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r11w -; SSE2-NEXT: cmovol %ecx, %r11d -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: movd %xmm0, %r14d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r14d, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r14w -; SSE2-NEXT: cmovol %ecx, %r14d -; SSE2-NEXT: pextrw $1, %xmm1, %eax -; SSE2-NEXT: pextrw $1, %xmm0, %r15d -; SSE2-NEXT: xorl %esi, 
%esi -; SSE2-NEXT: movl %r15d, %edi -; SSE2-NEXT: addw %ax, %di -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r15w -; SSE2-NEXT: cmovol %esi, %r15d -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: pextrw $2, %xmm0, %r12d -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %r12d, %ebx -; SSE2-NEXT: addw %ax, %bx -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r12w -; SSE2-NEXT: cmovol %edi, %r12d -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: pextrw $3, %xmm0, %r13d -; SSE2-NEXT: xorl %ebx, %ebx -; SSE2-NEXT: movl %r13d, %ebp -; SSE2-NEXT: addw %ax, %bp -; SSE2-NEXT: setns %bl -; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %r13w -; SSE2-NEXT: cmovol %ebx, %r13d -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: pextrw $4, %xmm0, %ebx -; SSE2-NEXT: xorl %ebp, %ebp -; SSE2-NEXT: movl %ebx, %ecx -; SSE2-NEXT: addw %ax, %cx -; SSE2-NEXT: setns %bpl -; SSE2-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %bx -; SSE2-NEXT: cmovol %ebp, %ebx -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: pextrw $5, %xmm0, %ebp -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebp, %esi -; SSE2-NEXT: addw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: addw %ax, %bp -; SSE2-NEXT: cmovol %ecx, %ebp -; SSE2-NEXT: pextrw $6, %xmm1, %ecx -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: addw %cx, %di -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE2-NEXT: addw %cx, %ax -; SSE2-NEXT: cmovol %esi, %eax -; SSE2-NEXT: pextrw $7, %xmm1, %ecx -; SSE2-NEXT: pextrw $7, %xmm0, %esi -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %esi, %r8d -; SSE2-NEXT: addw %cx, %r8w -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE2-NEXT: addw %cx, %si -; SSE2-NEXT: cmovol %edi, %esi -; SSE2-NEXT: movd %esi, %xmm8 -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movd %ebp, %xmm2 -; SSE2-NEXT: movd %ebx, %xmm3 -; SSE2-NEXT: movd %r13d, %xmm4 -; SSE2-NEXT: movd %r12d, %xmm5 -; SSE2-NEXT: movd %r15d, %xmm6 -; SSE2-NEXT: movd %r14d, %xmm7 -; SSE2-NEXT: movd %r10d, %xmm0 -; SSE2-NEXT: pinsrw $1, %r11d, %xmm0 -; SSE2-NEXT: pinsrw $2, %r9d, %xmm0 -; SSE2-NEXT: pinsrw $3, %edx, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: movq %xmm0, 16(%rax) -; SSE2-NEXT: movdqa %xmm7, (%rax) -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq +; SSE-LABEL: v12i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddsw (%rsi), %xmm0 +; SSE-NEXT: paddsw 16(%rsi), %xmm1 +; SSE-NEXT: movq %xmm1, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, (%rdx) +; SSE-NEXT: retq ; -; 
SSSE3-LABEL: v12i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSSE3-NEXT: movdqa (%rdi), %xmm0 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm2 -; SSSE3-NEXT: movdqa (%rsi), %xmm1 -; SSSE3-NEXT: movdqa 16(%rsi), %xmm3 -; SSSE3-NEXT: pextrw $3, %xmm3, %eax -; SSSE3-NEXT: pextrw $3, %xmm2, %edx -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %edx, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %dx -; SSSE3-NEXT: cmovol %ecx, %edx -; SSSE3-NEXT: pextrw $2, %xmm3, %eax -; SSSE3-NEXT: pextrw $2, %xmm2, %r9d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r9d, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r9w -; SSSE3-NEXT: cmovol %ecx, %r9d -; SSSE3-NEXT: movd %xmm3, %eax -; SSSE3-NEXT: movd %xmm2, %r10d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r10d, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r10w -; SSSE3-NEXT: cmovol %ecx, %r10d -; SSSE3-NEXT: pextrw $1, %xmm3, %eax -; SSSE3-NEXT: pextrw $1, %xmm2, %r11d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r11d, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r11w -; SSSE3-NEXT: cmovol %ecx, %r11d -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: movd %xmm0, %r14d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r14d, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r14w -; SSSE3-NEXT: cmovol %ecx, %r14d -; SSSE3-NEXT: pextrw $1, %xmm1, %eax -; SSSE3-NEXT: pextrw $1, %xmm0, %r15d -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %r15d, %edi -; SSSE3-NEXT: addw %ax, %di -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r15w -; SSSE3-NEXT: cmovol %esi, %r15d -; SSSE3-NEXT: pextrw $2, %xmm1, %eax -; SSSE3-NEXT: pextrw $2, %xmm0, %r12d -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %r12d, %ebx -; SSSE3-NEXT: addw %ax, %bx -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r12w -; SSSE3-NEXT: cmovol %edi, %r12d -; SSSE3-NEXT: pextrw $3, %xmm1, %eax -; SSSE3-NEXT: pextrw $3, %xmm0, %r13d -; SSSE3-NEXT: xorl %ebx, %ebx -; SSSE3-NEXT: movl %r13d, %ebp -; SSSE3-NEXT: addw %ax, %bp -; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %r13w -; SSSE3-NEXT: cmovol %ebx, %r13d -; SSSE3-NEXT: pextrw $4, %xmm1, %eax -; SSSE3-NEXT: pextrw $4, %xmm0, %ebx -; SSSE3-NEXT: xorl %ebp, %ebp -; SSSE3-NEXT: movl %ebx, %ecx -; SSSE3-NEXT: addw %ax, %cx -; SSSE3-NEXT: setns %bpl -; SSSE3-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %bx -; SSSE3-NEXT: cmovol %ebp, %ebx -; SSSE3-NEXT: pextrw $5, %xmm1, %eax -; SSSE3-NEXT: pextrw $5, %xmm0, %ebp -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebp, %esi -; SSSE3-NEXT: addw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: addw %ax, %bp -; SSSE3-NEXT: cmovol %ecx, %ebp -; SSSE3-NEXT: pextrw $6, %xmm1, %ecx -; SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSSE3-NEXT: xorl %esi, %esi -; 
SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: addw %cx, %di -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSSE3-NEXT: addw %cx, %ax -; SSSE3-NEXT: cmovol %esi, %eax -; SSSE3-NEXT: pextrw $7, %xmm1, %ecx -; SSSE3-NEXT: pextrw $7, %xmm0, %esi -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %esi, %r8d -; SSSE3-NEXT: addw %cx, %r8w -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSSE3-NEXT: addw %cx, %si -; SSSE3-NEXT: cmovol %edi, %esi -; SSSE3-NEXT: movd %esi, %xmm8 -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movd %ebp, %xmm2 -; SSSE3-NEXT: movd %ebx, %xmm3 -; SSSE3-NEXT: movd %r13d, %xmm4 -; SSSE3-NEXT: movd %r12d, %xmm5 -; SSSE3-NEXT: movd %r15d, %xmm6 -; SSSE3-NEXT: movd %r14d, %xmm7 -; SSSE3-NEXT: movd %r10d, %xmm0 -; SSSE3-NEXT: pinsrw $1, %r11d, %xmm0 -; SSSE3-NEXT: pinsrw $2, %r9d, %xmm0 -; SSSE3-NEXT: pinsrw $3, %edx, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSSE3-NEXT: movq %xmm0, 16(%rax) -; SSSE3-NEXT: movdqa %xmm7, (%rax) -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq +; AVX1-LABEL: v12i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpaddsw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpaddsw 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: retq ; -; SSE41-LABEL: v12i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa (%rsi), %xmm1 -; SSE41-NEXT: movdqa 16(%rsi), %xmm3 -; SSE41-NEXT: pextrw $3, %xmm3, %eax -; SSE41-NEXT: pextrw $3, %xmm2, %edx -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %edx, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %dx -; SSE41-NEXT: cmovol %ecx, %edx -; SSE41-NEXT: pextrw $2, %xmm3, %eax -; SSE41-NEXT: pextrw $2, %xmm2, %r9d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r9d, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r9w -; SSE41-NEXT: cmovol %ecx, %r9d -; SSE41-NEXT: movd %xmm3, %eax -; SSE41-NEXT: movd %xmm2, %r10d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r10d, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r10w -; SSE41-NEXT: cmovol %ecx, %r10d -; SSE41-NEXT: pextrw $1, %xmm3, %eax -; SSE41-NEXT: pextrw $1, %xmm2, %r11d -; SSE41-NEXT: xorl %ecx, %ecx 
-; SSE41-NEXT: movl %r11d, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %r11w -; SSE41-NEXT: cmovol %ecx, %r11d -; SSE41-NEXT: pextrw $7, %xmm1, %ecx -; SSE41-NEXT: pextrw $7, %xmm0, %r14d -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %r14d, %edi -; SSE41-NEXT: addw %cx, %di -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE41-NEXT: addw %cx, %r14w -; SSE41-NEXT: cmovol %esi, %r14d -; SSE41-NEXT: pextrw $6, %xmm1, %esi -; SSE41-NEXT: pextrw $6, %xmm0, %r15d -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %r15d, %ebx -; SSE41-NEXT: addw %si, %bx -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE41-NEXT: addw %si, %r15w -; SSE41-NEXT: cmovol %edi, %r15d -; SSE41-NEXT: pextrw $5, %xmm1, %edi -; SSE41-NEXT: pextrw $5, %xmm0, %r12d -; SSE41-NEXT: xorl %ebx, %ebx -; SSE41-NEXT: movl %r12d, %ebp -; SSE41-NEXT: addw %di, %bp -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSE41-NEXT: addw %di, %r12w -; SSE41-NEXT: cmovol %ebx, %r12d -; SSE41-NEXT: pextrw $4, %xmm1, %ebx -; SSE41-NEXT: pextrw $4, %xmm0, %r13d -; SSE41-NEXT: xorl %ebp, %ebp -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: addw %bx, %ax -; SSE41-NEXT: setns %bpl -; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSE41-NEXT: addw %bx, %r13w -; SSE41-NEXT: cmovol %ebp, %r13d -; SSE41-NEXT: pextrw $3, %xmm1, %eax -; SSE41-NEXT: pextrw $3, %xmm0, %ebx -; SSE41-NEXT: xorl %ebp, %ebp -; SSE41-NEXT: movl %ebx, %ecx -; SSE41-NEXT: addw %ax, %cx -; SSE41-NEXT: setns %bpl -; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %bx -; SSE41-NEXT: cmovol %ebp, %ebx -; SSE41-NEXT: pextrw $2, %xmm1, %eax -; SSE41-NEXT: pextrw $2, %xmm0, %ebp -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %ebp, %esi -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %bp -; SSE41-NEXT: cmovol %ecx, %ebp -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: movd %xmm0, %ecx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %ecx, %edi -; SSE41-NEXT: addw %ax, %di -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %cx -; SSE41-NEXT: cmovol %esi, %ecx -; SSE41-NEXT: pextrw $1, %xmm1, %eax -; SSE41-NEXT: pextrw $1, %xmm0, %esi -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %esi, %r8d -; SSE41-NEXT: addw %ax, %r8w -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE41-NEXT: addw %ax, %si -; SSE41-NEXT: cmovol %edi, %esi -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrw $1, %esi, %xmm0 -; SSE41-NEXT: pinsrw $2, %ebp, %xmm0 -; SSE41-NEXT: pinsrw $3, %ebx, %xmm0 -; SSE41-NEXT: pinsrw $4, %r13d, %xmm0 -; SSE41-NEXT: pinsrw $5, %r12d, %xmm0 -; SSE41-NEXT: pinsrw $6, %r15d, %xmm0 -; SSE41-NEXT: pinsrw $7, %r14d, %xmm0 -; SSE41-NEXT: movd %r10d, %xmm1 -; SSE41-NEXT: pinsrw $1, %r11d, %xmm1 -; SSE41-NEXT: pinsrw $2, %r9d, %xmm1 -; SSE41-NEXT: pinsrw $3, %edx, %xmm1 -; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE41-NEXT: movq %xmm1, 16(%rax) -; SSE41-NEXT: movdqa %xmm0, (%rax) -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; AVX2-LABEL: v12i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddsw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 
+; AVX2-NEXT: vmovq %xmm1, 16(%rdx) +; AVX2-NEXT: vmovdqa %xmm0, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; -; AVX-LABEL: v12i16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r15 -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 -; AVX-NEXT: pushq %r12 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: vmovdqa (%rsi), %xmm2 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX-NEXT: vmovd %xmm2, %eax -; AVX-NEXT: vmovdqa (%rdi), %xmm3 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovd %xmm3, %edx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %dx -; AVX-NEXT: cmovol %ecx, %edx -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: vpextrw $1, %xmm2, %eax -; AVX-NEXT: vpextrw $1, %xmm3, %edx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %dx -; AVX-NEXT: cmovol %ecx, %edx -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: vpextrw $2, %xmm2, %eax -; AVX-NEXT: vpextrw $2, %xmm3, %edx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %dx -; AVX-NEXT: cmovol %ecx, %edx -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: vpextrw $3, %xmm2, %eax -; AVX-NEXT: vpextrw $3, %xmm3, %edx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %dx -; AVX-NEXT: cmovol %ecx, %edx -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: vpextrw $4, %xmm2, %eax -; AVX-NEXT: vpextrw $4, %xmm3, %r14d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %r14d, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %r14w -; AVX-NEXT: cmovol %ecx, %r14d -; AVX-NEXT: vpextrw $5, %xmm2, %eax -; AVX-NEXT: vpextrw $5, %xmm3, %r15d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %r15d, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %r15w -; AVX-NEXT: cmovol %ecx, %r15d -; AVX-NEXT: vpextrw $6, %xmm2, %eax -; AVX-NEXT: vpextrw $6, %xmm3, %r12d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %r12d, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %r12w -; AVX-NEXT: cmovol %ecx, %r12d -; AVX-NEXT: vpextrw $7, %xmm2, %eax -; AVX-NEXT: vpextrw $7, %xmm3, %r13d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %r13d, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %r13w -; AVX-NEXT: cmovol %ecx, %r13d -; AVX-NEXT: vpextrw $7, %xmm0, %eax -; AVX-NEXT: vpextrw $7, %xmm1, %ebx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %ebx, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %bx -; AVX-NEXT: cmovol %ecx, %ebx -; AVX-NEXT: vpextrw $6, %xmm0, %eax -; AVX-NEXT: vpextrw $6, %xmm1, %ebp -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %ebp, %esi -; AVX-NEXT: addw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, 
%ecx # imm = 0x7FFF -; AVX-NEXT: addw %ax, %bp -; AVX-NEXT: cmovol %ecx, %ebp -; AVX-NEXT: vpextrw $5, %xmm0, %ecx -; AVX-NEXT: vpextrw $5, %xmm1, %eax -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: addw %cx, %di -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX-NEXT: addw %cx, %ax -; AVX-NEXT: cmovol %esi, %eax -; AVX-NEXT: vpextrw $4, %xmm0, %esi -; AVX-NEXT: vpextrw $4, %xmm1, %ecx -; AVX-NEXT: xorl %edi, %edi -; AVX-NEXT: movl %ecx, %r8d -; AVX-NEXT: addw %si, %r8w -; AVX-NEXT: setns %dil -; AVX-NEXT: addl $32767, %edi # imm = 0x7FFF -; AVX-NEXT: addw %si, %cx -; AVX-NEXT: cmovol %edi, %ecx -; AVX-NEXT: vpextrw $3, %xmm0, %edi -; AVX-NEXT: vpextrw $3, %xmm1, %r8d -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %r8d, %edx -; AVX-NEXT: addw %di, %dx -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX-NEXT: addw %di, %r8w -; AVX-NEXT: cmovol %esi, %r8d -; AVX-NEXT: vpextrw $2, %xmm0, %edx -; AVX-NEXT: vpextrw $2, %xmm1, %edi -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %edi, %r9d -; AVX-NEXT: addw %dx, %r9w -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX-NEXT: addw %dx, %di -; AVX-NEXT: cmovol %esi, %edi -; AVX-NEXT: vmovd %xmm0, %r9d -; AVX-NEXT: vmovd %xmm1, %esi -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: movl %esi, %r10d -; AVX-NEXT: addw %r9w, %r10w -; AVX-NEXT: setns %dl -; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX-NEXT: addw %r9w, %si -; AVX-NEXT: cmovol %edx, %esi -; AVX-NEXT: vpextrw $1, %xmm0, %r9d -; AVX-NEXT: vpextrw $1, %xmm1, %edx -; AVX-NEXT: xorl %r10d, %r10d -; AVX-NEXT: movl %edx, %r11d -; AVX-NEXT: addw %r9w, %r11w -; AVX-NEXT: setns %r10b -; AVX-NEXT: addl $32767, %r10d # imm = 0x7FFF -; AVX-NEXT: addw %r9w, %dx -; AVX-NEXT: cmovol %r10d, %edx -; AVX-NEXT: vmovd %esi, %xmm0 -; AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0 -; AVX-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; AVX-NEXT: vpinsrw $4, %r14d, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $5, %r15d, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $6, %r12d, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $7, %r13d, %xmm1, %xmm1 -; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX-NEXT: vmovq %xmm0, 16(%rax) -; AVX-NEXT: vmovdqa %xmm1, (%rax) -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r13 -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %r15 -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX512-LABEL: v12i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpaddsw (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = load <12 x i16>, <12 x i16>* %px %y = load <12 x i16>, <12 x i16>* %py %z = call <12 x i16> @llvm.sadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y) @@ -16872,1960 +565,63 @@ ; Promotion define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { -; 
SSE2-LABEL: v16i4: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: psllw $4, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r9b -; SSE2-NEXT: jno .LBB15_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB15_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %sil -; SSE2-NEXT: jno .LBB15_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB15_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: jno .LBB15_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB15_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB15_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB15_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r10b -; SSE2-NEXT: jno .LBB15_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB15_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r11b -; SSE2-NEXT: jno .LBB15_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB15_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bpl -; SSE2-NEXT: jno .LBB15_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB15_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r14b -; SSE2-NEXT: jno .LBB15_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB15_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r15b -; SSE2-NEXT: jno .LBB15_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB15_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb 
-{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r12b -; SSE2-NEXT: jno .LBB15_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB15_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r13b -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB15_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB15_22: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dil -; SSE2-NEXT: jno .LBB15_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB15_24: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r8b -; SSE2-NEXT: jno .LBB15_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB15_26: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: jno .LBB15_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB15_28: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: addb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addb %dl, %al -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB15_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB15_30: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %ecx -; SSE2-NEXT: addb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addb %dl, %sil -; SSE2-NEXT: jno .LBB15_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %esi -; SSE2-NEXT: .LBB15_32: -; SSE2-NEXT: movzbl %sil, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r13b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movzbl %r12b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r15b, %eax -; 
SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movzbl %r14b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %bpl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl %r11b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r10b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: psllw $4, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: psllw $4, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: jno .LBB15_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB15_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: jno .LBB15_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: 
movl %eax, %esi -; SSSE3-NEXT: .LBB15_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: jno .LBB15_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB15_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB15_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB15_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r10b -; SSSE3-NEXT: jno .LBB15_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB15_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r11b -; SSSE3-NEXT: jno .LBB15_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB15_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bpl -; SSSE3-NEXT: jno .LBB15_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB15_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r14b -; SSSE3-NEXT: jno .LBB15_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB15_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r15b -; SSSE3-NEXT: jno .LBB15_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB15_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r12b -; SSSE3-NEXT: jno .LBB15_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB15_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r13b -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB15_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB15_22: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dil -; SSSE3-NEXT: jno .LBB15_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: addb $127, 
%al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB15_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r8b -; SSSE3-NEXT: jno .LBB15_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB15_26: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: jno .LBB15_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB15_28: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: addb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addb %dl, %al -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB15_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB15_30: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %ecx -; SSSE3-NEXT: addb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addb %dl, %sil -; SSSE3-NEXT: jno .LBB15_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: .LBB15_32: -; SSSE3-NEXT: movzbl %sil, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r8b, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r13b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: movzbl %r12b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r15b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movzbl %r14b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %bpl, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl %r11b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r10b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: psubb %xmm1, %xmm0 -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i4: -; SSE41: # %bb.0: -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pextrb $15, %xmm1, %ecx -; SSE41-NEXT: psllw $4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pextrb $15, %xmm0, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: jno .LBB15_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB15_2: -; SSE41-NEXT: pextrb $14, %xmm1, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r11b -; SSE41-NEXT: jno .LBB15_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB15_4: -; SSE41-NEXT: pextrb $13, %xmm1, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: jno .LBB15_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB15_6: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $12, %xmm1, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r14b -; SSE41-NEXT: jno .LBB15_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB15_8: -; SSE41-NEXT: pextrb $11, %xmm1, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: jno 
.LBB15_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB15_10: -; SSE41-NEXT: pextrb $10, %xmm1, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r15b -; SSE41-NEXT: jno .LBB15_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB15_12: -; SSE41-NEXT: pextrb $9, %xmm1, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r12b -; SSE41-NEXT: jno .LBB15_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB15_14: -; SSE41-NEXT: pextrb $8, %xmm1, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r13b -; SSE41-NEXT: jno .LBB15_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB15_16: -; SSE41-NEXT: pextrb $7, %xmm1, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r10b -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB15_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB15_18: -; SSE41-NEXT: pextrb $6, %xmm1, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r9b -; SSE41-NEXT: jno .LBB15_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB15_20: -; SSE41-NEXT: pextrb $5, %xmm1, %ecx -; SSE41-NEXT: pextrb $5, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB15_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB15_22: -; SSE41-NEXT: pextrb $4, %xmm1, %ecx -; SSE41-NEXT: pextrb $4, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB15_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB15_24: -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: addb %dl, %cl -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addb %dl, %al -; SSE41-NEXT: jno .LBB15_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: addb $127, %cl -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_26: -; SSE41-NEXT: pextrb $2, %xmm1, %ebx -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: addb %bl, %dl -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addb %bl, %cl -; SSE41-NEXT: jno .LBB15_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: addb $127, %dl -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB15_28: -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: addb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addb %sil, %dl -; SSE41-NEXT: jno .LBB15_30 -; 
SSE41-NEXT: # %bb.29: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: .LBB15_30: -; SSE41-NEXT: pextrb $1, %xmm1, %esi -; SSE41-NEXT: pextrb $1, %xmm0, %r8d -; SSE41-NEXT: movl %r8d, %ebx -; SSE41-NEXT: addb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addb %sil, %r8b -; SSE41-NEXT: jno .LBB15_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r8d -; SSE41-NEXT: .LBB15_32: -; SSE41-NEXT: movzbl %dl, %edx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: movzbl %r8b, %edx -; SSE41-NEXT: pinsrb $1, %edx, %xmm0 -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm0 -; SSE41-NEXT: movzbl %dil, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm0 -; SSE41-NEXT: movzbl %bpl, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm0 -; SSE41-NEXT: movzbl %r9b, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm0 -; SSE41-NEXT: movzbl %r10b, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %eax, %xmm0 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: movzbl %r11b, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: psubb %xmm1, %xmm0 -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; SSE-LABEL: v16i4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: psllw $4, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: paddsb %xmm1, %xmm0 +; SSE-NEXT: psrlw $4, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: ; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $15, %xmm1, %ecx ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $15, %xmm0, %edx -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %dl -; AVX-NEXT: jno .LBB15_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: .LBB15_2: -; AVX-NEXT: vpextrb $14, %xmm1, %ecx -; AVX-NEXT: vpextrb $14, %xmm0, %r11d -; AVX-NEXT: movl %r11d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r11b -; AVX-NEXT: jno .LBB15_4 -; AVX-NEXT: # %bb.3: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r11d -; AVX-NEXT: .LBB15_4: -; AVX-NEXT: vpextrb $13, %xmm1, %ecx 
-; AVX-NEXT: vpextrb $13, %xmm0, %edi -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %dil -; AVX-NEXT: jno .LBB15_6 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: .LBB15_6: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r15 -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 -; AVX-NEXT: pushq %r12 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vpextrb $12, %xmm1, %ecx -; AVX-NEXT: vpextrb $12, %xmm0, %r14d -; AVX-NEXT: movl %r14d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r14b -; AVX-NEXT: jno .LBB15_8 -; AVX-NEXT: # %bb.7: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r14d -; AVX-NEXT: .LBB15_8: -; AVX-NEXT: vpextrb $11, %xmm1, %ecx -; AVX-NEXT: vpextrb $11, %xmm0, %ebp -; AVX-NEXT: movl %ebp, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %bpl -; AVX-NEXT: jno .LBB15_10 -; AVX-NEXT: # %bb.9: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: .LBB15_10: -; AVX-NEXT: vpextrb $10, %xmm1, %ecx -; AVX-NEXT: vpextrb $10, %xmm0, %r15d -; AVX-NEXT: movl %r15d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r15b -; AVX-NEXT: jno .LBB15_12 -; AVX-NEXT: # %bb.11: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r15d -; AVX-NEXT: .LBB15_12: -; AVX-NEXT: vpextrb $9, %xmm1, %ecx -; AVX-NEXT: vpextrb $9, %xmm0, %r12d -; AVX-NEXT: movl %r12d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r12b -; AVX-NEXT: jno .LBB15_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r12d -; AVX-NEXT: .LBB15_14: -; AVX-NEXT: vpextrb $8, %xmm1, %ecx -; AVX-NEXT: vpextrb $8, %xmm0, %r13d -; AVX-NEXT: movl %r13d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r13b -; AVX-NEXT: jno .LBB15_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r13d -; AVX-NEXT: .LBB15_16: -; AVX-NEXT: vpextrb $7, %xmm1, %ecx -; AVX-NEXT: vpextrb $7, %xmm0, %r10d -; AVX-NEXT: movl %r10d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r10b -; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB15_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r10d -; AVX-NEXT: .LBB15_18: -; AVX-NEXT: vpextrb $6, %xmm1, %ecx -; AVX-NEXT: vpextrb $6, %xmm0, %r9d -; AVX-NEXT: movl %r9d, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %r9b -; AVX-NEXT: jno .LBB15_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r9d -; AVX-NEXT: .LBB15_20: -; AVX-NEXT: vpextrb $5, %xmm1, %ecx -; AVX-NEXT: vpextrb $5, %xmm0, %ebp -; AVX-NEXT: movl %ebp, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %bpl -; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB15_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: .LBB15_22: -; AVX-NEXT: vpextrb $4, %xmm1, %ecx -; AVX-NEXT: vpextrb $4, %xmm0, %edi -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: addb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: addb %cl, %dil -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB15_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: .LBB15_24: -; AVX-NEXT: vpextrb $3, %xmm1, %edx -; AVX-NEXT: vpextrb $3, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: 
addb %dl, %cl -; AVX-NEXT: setns %cl -; AVX-NEXT: addb %dl, %al -; AVX-NEXT: jno .LBB15_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: addb $127, %cl -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB15_26: -; AVX-NEXT: vpextrb $2, %xmm1, %ebx -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: addb %bl, %dl -; AVX-NEXT: setns %dl -; AVX-NEXT: addb %bl, %cl -; AVX-NEXT: jno .LBB15_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: addb $127, %dl -; AVX-NEXT: movl %edx, %ecx -; AVX-NEXT: .LBB15_28: -; AVX-NEXT: vpextrb $0, %xmm1, %esi -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: movl %edx, %ebx -; AVX-NEXT: addb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: addb %sil, %dl -; AVX-NEXT: jno .LBB15_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %edx -; AVX-NEXT: .LBB15_30: -; AVX-NEXT: vpextrb $1, %xmm1, %esi -; AVX-NEXT: vpextrb $1, %xmm0, %r8d -; AVX-NEXT: movl %r8d, %ebx -; AVX-NEXT: addb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: addb %sil, %r8b -; AVX-NEXT: jno .LBB15_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %r8d -; AVX-NEXT: .LBB15_32: -; AVX-NEXT: movzbl %dl, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: movzbl %r8b, %edx -; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %cl, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %dil, %eax -; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %bpl, %eax -; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r9b, %eax -; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r10b, %eax -; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r13b, %eax -; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r12b, %eax -; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r15b, %eax -; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r14b, %eax -; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r11b, %eax -; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r13 -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %r15 -; AVX-NEXT: popq %rbp ; AVX-NEXT: retq %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { -; SSE2-LABEL: v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: psllw $7, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; 
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r9b -; SSE2-NEXT: jno .LBB16_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB16_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %sil -; SSE2-NEXT: jno .LBB16_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB16_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: jno .LBB16_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB16_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dl -; SSE2-NEXT: jno .LBB16_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB16_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r10b -; SSE2-NEXT: jno .LBB16_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB16_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r11b -; SSE2-NEXT: jno .LBB16_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB16_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bpl -; SSE2-NEXT: jno .LBB16_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB16_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r14b -; SSE2-NEXT: jno .LBB16_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB16_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r15b -; SSE2-NEXT: jno .LBB16_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB16_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r12b -; SSE2-NEXT: jno .LBB16_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB16_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r13b -; SSE2-NEXT: movb %bl, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB16_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB16_22: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %dil -; SSE2-NEXT: jno .LBB16_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB16_24: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %r8b -; SSE2-NEXT: jno .LBB16_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB16_26: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: addb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: addb %cl, %bl -; SSE2-NEXT: jno .LBB16_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB16_28: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: addb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addb %dl, %al -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB16_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB16_30: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %ecx -; SSE2-NEXT: addb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addb %dl, %sil -; SSE2-NEXT: jno .LBB16_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %esi -; SSE2-NEXT: .LBB16_32: -; SSE2-NEXT: movzbl %sil, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r13b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl %r12b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movzbl %r15b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movzbl %r14b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl %bpl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movzbl %r11b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl %r10b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm0 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i1: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: psllw $7, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: psllw $7, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r9b -; SSSE3-NEXT: jno .LBB16_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB16_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %sil -; SSSE3-NEXT: jno .LBB16_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB16_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: jno .LBB16_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB16_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dl -; SSSE3-NEXT: jno .LBB16_8 -; SSSE3-NEXT: # %bb.7: -; 
SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB16_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r10b -; SSSE3-NEXT: jno .LBB16_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB16_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r11b -; SSSE3-NEXT: jno .LBB16_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB16_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bpl -; SSSE3-NEXT: jno .LBB16_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB16_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r14b -; SSSE3-NEXT: jno .LBB16_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB16_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r15b -; SSSE3-NEXT: jno .LBB16_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB16_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r12b -; SSSE3-NEXT: jno .LBB16_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB16_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r13b -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB16_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB16_22: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %dil -; SSSE3-NEXT: jno .LBB16_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB16_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %r8b -; SSSE3-NEXT: jno .LBB16_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB16_26: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: addb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addb %cl, %bl -; SSSE3-NEXT: jno .LBB16_28 -; 
SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB16_28: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: addb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addb %dl, %al -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB16_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB16_30: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %ecx -; SSSE3-NEXT: addb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addb %dl, %sil -; SSSE3-NEXT: jno .LBB16_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: .LBB16_32: -; SSSE3-NEXT: movzbl %sil, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r8b, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r13b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movzbl %r12b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movzbl %r15b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movzbl %r14b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl %bpl, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: movzbl %r11b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl %r10b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: 
movzbl %r9b, %eax -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pcmpgtb %xmm4, %xmm0 -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: psllw $7, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pextrb $15, %xmm1, %ecx -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pextrb $15, %xmm0, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dl -; SSE41-NEXT: jno .LBB16_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB16_2: -; SSE41-NEXT: pextrb $14, %xmm1, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r11b -; SSE41-NEXT: jno .LBB16_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB16_4: -; SSE41-NEXT: pextrb $13, %xmm1, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: jno .LBB16_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB16_6: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $12, %xmm1, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r14b -; SSE41-NEXT: jno .LBB16_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB16_8: -; SSE41-NEXT: pextrb $11, %xmm1, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: jno .LBB16_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB16_10: -; SSE41-NEXT: pextrb $10, %xmm1, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r15b -; SSE41-NEXT: jno .LBB16_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB16_12: -; SSE41-NEXT: pextrb $9, %xmm1, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r12b -; SSE41-NEXT: jno .LBB16_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB16_14: -; SSE41-NEXT: pextrb $8, %xmm1, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %r13d -; 
SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r13b -; SSE41-NEXT: jno .LBB16_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB16_16: -; SSE41-NEXT: pextrb $7, %xmm1, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r10b -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB16_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB16_18: -; SSE41-NEXT: pextrb $6, %xmm1, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %r9b -; SSE41-NEXT: jno .LBB16_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB16_20: -; SSE41-NEXT: pextrb $5, %xmm1, %ecx -; SSE41-NEXT: pextrb $5, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %bpl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB16_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB16_22: -; SSE41-NEXT: pextrb $4, %xmm1, %ecx -; SSE41-NEXT: pextrb $4, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: addb %cl, %dil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB16_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB16_24: -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: addb %dl, %cl -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addb %dl, %al -; SSE41-NEXT: jno .LBB16_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: addb $127, %cl -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_26: -; SSE41-NEXT: pextrb $2, %xmm1, %ebx -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: addb %bl, %dl -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addb %bl, %cl -; SSE41-NEXT: jno .LBB16_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: addb $127, %dl -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB16_28: -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: addb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addb %sil, %dl -; SSE41-NEXT: jno .LBB16_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: .LBB16_30: -; SSE41-NEXT: pextrb $1, %xmm1, %esi -; SSE41-NEXT: pextrb $1, %xmm0, %r8d -; SSE41-NEXT: movl %r8d, %ebx -; SSE41-NEXT: addb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addb %sil, %r8b -; SSE41-NEXT: jno .LBB16_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r8d -; SSE41-NEXT: .LBB16_32: -; SSE41-NEXT: movzbl %dl, %edx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: movzbl %r8b, %edx -; SSE41-NEXT: pinsrb $1, %edx, %xmm1 -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm1 -; SSE41-NEXT: movzbl %dil, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm1 -; SSE41-NEXT: movzbl %bpl, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm1 -; SSE41-NEXT: movzbl 
%r9b, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm1 -; SSE41-NEXT: movzbl %r10b, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm1 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm1 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm1 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %eax, %xmm1 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm1 -; SSE41-NEXT: movzbl %r11b, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; SSE-LABEL: v16i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: paddsb %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %ecx ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $15, %xmm0, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %dl -; AVX1-NEXT: jno .LBB16_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB16_2: -; AVX1-NEXT: vpextrb $14, %xmm1, %ecx -; AVX1-NEXT: vpextrb $14, %xmm0, %r11d -; AVX1-NEXT: movl %r11d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r11b -; AVX1-NEXT: jno .LBB16_4 -; AVX1-NEXT: # %bb.3: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r11d -; AVX1-NEXT: .LBB16_4: -; AVX1-NEXT: vpextrb $13, %xmm1, %ecx -; AVX1-NEXT: vpextrb $13, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %dil -; AVX1-NEXT: jno .LBB16_6 -; AVX1-NEXT: # %bb.5: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB16_6: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: pushq %r12 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: vpextrb $12, %xmm1, %ecx -; AVX1-NEXT: vpextrb $12, %xmm0, %r14d -; AVX1-NEXT: movl %r14d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r14b -; AVX1-NEXT: jno .LBB16_8 -; AVX1-NEXT: # %bb.7: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: .LBB16_8: -; AVX1-NEXT: vpextrb $11, %xmm1, %ecx -; AVX1-NEXT: vpextrb $11, %xmm0, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %bpl -; AVX1-NEXT: jno .LBB16_10 -; AVX1-NEXT: # %bb.9: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB16_10: -; AVX1-NEXT: 
vpextrb $10, %xmm1, %ecx -; AVX1-NEXT: vpextrb $10, %xmm0, %r15d -; AVX1-NEXT: movl %r15d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r15b -; AVX1-NEXT: jno .LBB16_12 -; AVX1-NEXT: # %bb.11: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: .LBB16_12: -; AVX1-NEXT: vpextrb $9, %xmm1, %ecx -; AVX1-NEXT: vpextrb $9, %xmm0, %r12d -; AVX1-NEXT: movl %r12d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r12b -; AVX1-NEXT: jno .LBB16_14 -; AVX1-NEXT: # %bb.13: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r12d -; AVX1-NEXT: .LBB16_14: -; AVX1-NEXT: vpextrb $8, %xmm1, %ecx -; AVX1-NEXT: vpextrb $8, %xmm0, %r13d -; AVX1-NEXT: movl %r13d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r13b -; AVX1-NEXT: jno .LBB16_16 -; AVX1-NEXT: # %bb.15: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r13d -; AVX1-NEXT: .LBB16_16: -; AVX1-NEXT: vpextrb $7, %xmm1, %ecx -; AVX1-NEXT: vpextrb $7, %xmm0, %r10d -; AVX1-NEXT: movl %r10d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r10b -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB16_18 -; AVX1-NEXT: # %bb.17: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r10d -; AVX1-NEXT: .LBB16_18: -; AVX1-NEXT: vpextrb $6, %xmm1, %ecx -; AVX1-NEXT: vpextrb $6, %xmm0, %r9d -; AVX1-NEXT: movl %r9d, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %r9b -; AVX1-NEXT: jno .LBB16_20 -; AVX1-NEXT: # %bb.19: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r9d -; AVX1-NEXT: .LBB16_20: -; AVX1-NEXT: vpextrb $5, %xmm1, %ecx -; AVX1-NEXT: vpextrb $5, %xmm0, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %bpl -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB16_22 -; AVX1-NEXT: # %bb.21: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB16_22: -; AVX1-NEXT: vpextrb $4, %xmm1, %ecx -; AVX1-NEXT: vpextrb $4, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: addb %cl, %dil -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB16_24 -; AVX1-NEXT: # %bb.23: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB16_24: -; AVX1-NEXT: vpextrb $3, %xmm1, %edx -; AVX1-NEXT: vpextrb $3, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addb %dl, %al -; AVX1-NEXT: jno .LBB16_26 -; AVX1-NEXT: # %bb.25: -; AVX1-NEXT: addb $127, %cl -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB16_26: -; AVX1-NEXT: vpextrb $2, %xmm1, %ebx -; AVX1-NEXT: vpextrb $2, %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: addb %bl, %dl -; AVX1-NEXT: setns %dl -; AVX1-NEXT: addb %bl, %cl -; AVX1-NEXT: jno .LBB16_28 -; AVX1-NEXT: # %bb.27: -; AVX1-NEXT: addb $127, %dl -; AVX1-NEXT: movl %edx, %ecx -; AVX1-NEXT: .LBB16_28: -; AVX1-NEXT: vpextrb $0, %xmm1, %esi -; AVX1-NEXT: vpextrb $0, %xmm0, %edx -; AVX1-NEXT: movl %edx, %ebx -; AVX1-NEXT: addb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addb %sil, %dl -; AVX1-NEXT: jno .LBB16_30 -; AVX1-NEXT: # %bb.29: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %edx -; AVX1-NEXT: .LBB16_30: -; AVX1-NEXT: vpextrb $1, %xmm1, %esi -; AVX1-NEXT: vpextrb $1, %xmm0, %r8d -; AVX1-NEXT: movl %r8d, %ebx -; AVX1-NEXT: addb 
%sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: addb %sil, %r8b -; AVX1-NEXT: jno .LBB16_32 -; AVX1-NEXT: # %bb.31: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r8d -; AVX1-NEXT: .LBB16_32: -; AVX1-NEXT: movzbl %dl, %edx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: movzbl %r8b, %edx -; AVX1-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %dil, %eax -; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %bpl, %eax -; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r9b, %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r10b, %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r13b, %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r12b, %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r15b, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r14b, %eax -; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r11b, %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: @@ -18833,604 +629,21 @@ ; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %ecx ; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: jno .LBB16_2 -; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB16_2: -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r11b -; AVX2-NEXT: jno .LBB16_4 -; AVX2-NEXT: # %bb.3: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB16_4: -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: jno .LBB16_6 -; AVX2-NEXT: # %bb.5: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB16_6: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, %r14d -; AVX2-NEXT: movl %r14d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r14b -; AVX2-NEXT: jno .LBB16_8 -; AVX2-NEXT: # %bb.7: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl 
%eax, %r14d -; AVX2-NEXT: .LBB16_8: -; AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: jno .LBB16_10 -; AVX2-NEXT: # %bb.9: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB16_10: -; AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, %r15d -; AVX2-NEXT: movl %r15d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r15b -; AVX2-NEXT: jno .LBB16_12 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: .LBB16_12: -; AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r12b -; AVX2-NEXT: jno .LBB16_14 -; AVX2-NEXT: # %bb.13: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r12d -; AVX2-NEXT: .LBB16_14: -; AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r13b -; AVX2-NEXT: jno .LBB16_16 -; AVX2-NEXT: # %bb.15: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: .LBB16_16: -; AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, %r10d -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r10b -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB16_18 -; AVX2-NEXT: # %bb.17: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r10d -; AVX2-NEXT: .LBB16_18: -; AVX2-NEXT: vpextrb $6, %xmm1, %ecx -; AVX2-NEXT: vpextrb $6, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %r9b -; AVX2-NEXT: jno .LBB16_20 -; AVX2-NEXT: # %bb.19: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r9d -; AVX2-NEXT: .LBB16_20: -; AVX2-NEXT: vpextrb $5, %xmm1, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %bpl -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB16_22 -; AVX2-NEXT: # %bb.21: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB16_22: -; AVX2-NEXT: vpextrb $4, %xmm1, %ecx -; AVX2-NEXT: vpextrb $4, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: addb %cl, %dil -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB16_24 -; AVX2-NEXT: # %bb.23: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB16_24: -; AVX2-NEXT: vpextrb $3, %xmm1, %edx -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addb %dl, %al -; AVX2-NEXT: jno .LBB16_26 -; AVX2-NEXT: # %bb.25: -; AVX2-NEXT: addb $127, %cl -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB16_26: -; AVX2-NEXT: vpextrb $2, %xmm1, %ebx -; AVX2-NEXT: vpextrb $2, %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: addb %bl, %dl -; AVX2-NEXT: setns %dl -; AVX2-NEXT: addb %bl, %cl -; AVX2-NEXT: jno .LBB16_28 -; AVX2-NEXT: # %bb.27: -; AVX2-NEXT: addb $127, %dl -; AVX2-NEXT: movl %edx, %ecx -; AVX2-NEXT: .LBB16_28: -; AVX2-NEXT: vpextrb $0, %xmm1, %esi -; AVX2-NEXT: vpextrb $0, %xmm0, %edx 
-; AVX2-NEXT: movl %edx, %ebx
-; AVX2-NEXT: addb %sil, %bl
-; AVX2-NEXT: setns %bl
-; AVX2-NEXT: addb %sil, %dl
-; AVX2-NEXT: jno .LBB16_30
-; AVX2-NEXT: # %bb.29:
-; AVX2-NEXT: addb $127, %bl
-; AVX2-NEXT: movl %ebx, %edx
-; AVX2-NEXT: .LBB16_30:
-; AVX2-NEXT: vpextrb $1, %xmm1, %esi
-; AVX2-NEXT: vpextrb $1, %xmm0, %r8d
-; AVX2-NEXT: movl %r8d, %ebx
-; AVX2-NEXT: addb %sil, %bl
-; AVX2-NEXT: setns %bl
-; AVX2-NEXT: addb %sil, %r8b
-; AVX2-NEXT: jno .LBB16_32
-; AVX2-NEXT: # %bb.31:
-; AVX2-NEXT: addb $127, %bl
-; AVX2-NEXT: movl %ebx, %r8d
-; AVX2-NEXT: .LBB16_32:
-; AVX2-NEXT: movzbl %dl, %edx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: movzbl %r8b, %edx
-; AVX2-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %dil, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %bpl, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r9b, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r10b, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r13b, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r12b, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r15b, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r14b, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r11b, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: v16i1:
 ; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1
+; AVX512-NEXT: vpmovb2m %xmm1, %k0
 ; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX512-NEXT: vpmovb2m %xmm0, %k0
-; AVX512-NEXT: kshiftrw $1, %k0, %k1
-; AVX512-NEXT: kmovd %k1, %edx
-; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0
 ; AVX512-NEXT: vpmovb2m %xmm0, %k1
-; AVX512-NEXT: kshiftrw $1, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %dl
-; AVX512-NEXT: movl %edx, %ecx
-; AVX512-NEXT: addb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addb %al, %dl
-; AVX512-NEXT: kmovd %k0, %esi
-; AVX512-NEXT: kmovd %k1, %eax
-; AVX512-NEXT: jno .LBB16_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %edx
-; AVX512-NEXT: .LBB16_2:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %sil
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: addb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addb %al, %sil
-; AVX512-NEXT: kshiftrw $2, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %edi
-; AVX512-NEXT: kshiftrw $2, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: jno .LBB16_4
-; AVX512-NEXT: # %bb.3:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %esi
-; AVX512-NEXT: .LBB16_4:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %dil
-; AVX512-NEXT: movl %edi,
%ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %al, %dil -; AVX512-NEXT: kshiftrw $3, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %r11d -; AVX512-NEXT: kshiftrw $3, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: jno .LBB16_6 -; AVX512-NEXT: # %bb.5: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %edi -; AVX512-NEXT: .LBB16_6: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: shlb $7, %r11b -; AVX512-NEXT: movl %r11d, %ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %al, %r11b -; AVX512-NEXT: kshiftrw $4, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %r14d -; AVX512-NEXT: kshiftrw $4, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: jno .LBB16_8 -; AVX512-NEXT: # %bb.7: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %r11d -; AVX512-NEXT: .LBB16_8: -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: shlb $7, %r14b -; AVX512-NEXT: movl %r14d, %ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %al, %r14b -; AVX512-NEXT: kshiftrw $5, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %r15d -; AVX512-NEXT: kshiftrw $5, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: jno .LBB16_10 -; AVX512-NEXT: # %bb.9: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %r14d -; AVX512-NEXT: .LBB16_10: -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: shlb $7, %r15b -; AVX512-NEXT: movl %r15d, %ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %al, %r15b -; AVX512-NEXT: kshiftrw $6, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %r12d -; AVX512-NEXT: kshiftrw $6, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: jno .LBB16_12 -; AVX512-NEXT: # %bb.11: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %r15d -; AVX512-NEXT: .LBB16_12: -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: shlb $7, %r12b -; AVX512-NEXT: movl %r12d, %ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %al, %r12b -; AVX512-NEXT: kshiftrw $7, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %r13d -; AVX512-NEXT: kshiftrw $7, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: jno .LBB16_14 -; AVX512-NEXT: # %bb.13: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %r12d -; AVX512-NEXT: .LBB16_14: -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: shlb $7, %r13b -; AVX512-NEXT: movl %r13d, %ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %al, %r13b -; AVX512-NEXT: kshiftrw $8, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %r9d -; AVX512-NEXT: kshiftrw $8, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: jno .LBB16_16 -; AVX512-NEXT: # %bb.15: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %r13d -; AVX512-NEXT: .LBB16_16: -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: shlb $7, %r9b -; AVX512-NEXT: movl %r9d, %ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %al, %r9b -; AVX512-NEXT: kshiftrw $9, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %r10d -; AVX512-NEXT: kshiftrw $9, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: jno .LBB16_18 -; AVX512-NEXT: # %bb.17: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %r9d -; AVX512-NEXT: .LBB16_18: -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: shlb $7, %r10b -; AVX512-NEXT: movl %r10d, %ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %al, %r10b -; AVX512-NEXT: kshiftrw $10, 
%k0, %k2 -; AVX512-NEXT: kmovd %k2, %ebp -; AVX512-NEXT: kshiftrw $10, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB16_20 -; AVX512-NEXT: # %bb.19: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %r10d -; AVX512-NEXT: .LBB16_20: -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: shlb $7, %bpl -; AVX512-NEXT: movl %ebp, %ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addb %al, %bpl -; AVX512-NEXT: kshiftrw $11, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %edi -; AVX512-NEXT: kshiftrw $11, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB16_22 -; AVX512-NEXT: # %bb.21: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %ebp -; AVX512-NEXT: .LBB16_22: -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: shlb $7, %dil -; AVX512-NEXT: movl %edi, %ecx -; AVX512-NEXT: addb %al, %cl -; AVX512-NEXT: setns %dl -; AVX512-NEXT: addb %al, %dil -; AVX512-NEXT: kshiftrw $12, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: kshiftrw $12, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %ecx -; AVX512-NEXT: jno .LBB16_24 -; AVX512-NEXT: # %bb.23: -; AVX512-NEXT: addb $127, %dl -; AVX512-NEXT: movl %edx, %edi -; AVX512-NEXT: .LBB16_24: -; AVX512-NEXT: shlb $7, %cl -; AVX512-NEXT: shlb $7, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: addb %cl, %dl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: kshiftrw $13, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %ecx -; AVX512-NEXT: kshiftrw $13, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %edx -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB16_26 -; AVX512-NEXT: # %bb.25: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: .LBB16_26: -; AVX512-NEXT: shlb $7, %dl -; AVX512-NEXT: shlb $7, %cl -; AVX512-NEXT: movl %ecx, %ebx -; AVX512-NEXT: addb %dl, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %dl, %cl -; AVX512-NEXT: kshiftrw $14, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %edx -; AVX512-NEXT: kshiftrw $14, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %esi -; AVX512-NEXT: jno .LBB16_28 -; AVX512-NEXT: # %bb.27: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %ecx -; AVX512-NEXT: .LBB16_28: -; AVX512-NEXT: shlb $7, %sil -; AVX512-NEXT: shlb $7, %dl -; AVX512-NEXT: movl %edx, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %sil, %dl -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kmovd %k0, %r8d -; AVX512-NEXT: kshiftrw $15, %k1, %k0 -; AVX512-NEXT: kmovd %k0, %esi -; AVX512-NEXT: jno .LBB16_30 -; AVX512-NEXT: # %bb.29: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %edx -; AVX512-NEXT: .LBB16_30: -; AVX512-NEXT: shlb $7, %sil -; AVX512-NEXT: shlb $7, %r8b -; AVX512-NEXT: movl %r8d, %ebx -; AVX512-NEXT: addb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addb %sil, %r8b -; AVX512-NEXT: jno .LBB16_32 -; AVX512-NEXT: # %bb.31: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r8d -; AVX512-NEXT: .LBB16_32: -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; AVX512-NEXT: sarb $7, %sil -; AVX512-NEXT: kmovd %esi, %k1 -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; AVX512-NEXT: sarb $7, %sil -; AVX512-NEXT: kmovd %esi, %k0 -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; AVX512-NEXT: sarb $7, %sil -; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: sarb $7, %r11b -; AVX512-NEXT: 
kmovd %r11d, %k3 -; AVX512-NEXT: sarb $7, %r14b -; AVX512-NEXT: kmovd %r14d, %k4 -; AVX512-NEXT: sarb $7, %r15b -; AVX512-NEXT: kmovd %r15d, %k5 -; AVX512-NEXT: sarb $7, %r12b -; AVX512-NEXT: kmovd %r12d, %k6 -; AVX512-NEXT: kshiftrw $1, %k0, %k7 -; AVX512-NEXT: kxorw %k1, %k7, %k7 -; AVX512-NEXT: sarb $7, %r13b -; AVX512-NEXT: kmovd %r13d, %k1 -; AVX512-NEXT: kshiftlw $15, %k7, %k7 -; AVX512-NEXT: kshiftrw $14, %k7, %k7 -; AVX512-NEXT: kxorw %k7, %k0, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k7 -; AVX512-NEXT: kxorw %k2, %k7, %k7 -; AVX512-NEXT: sarb $7, %r9b -; AVX512-NEXT: kmovd %r9d, %k2 -; AVX512-NEXT: kshiftlw $15, %k7, %k7 -; AVX512-NEXT: kshiftrw $13, %k7, %k7 -; AVX512-NEXT: kxorw %k7, %k0, %k0 -; AVX512-NEXT: kshiftrw $3, %k0, %k7 -; AVX512-NEXT: kxorw %k3, %k7, %k7 -; AVX512-NEXT: sarb $7, %r10b -; AVX512-NEXT: kmovd %r10d, %k3 -; AVX512-NEXT: kshiftlw $15, %k7, %k7 -; AVX512-NEXT: kshiftrw $12, %k7, %k7 -; AVX512-NEXT: kxorw %k7, %k0, %k7 -; AVX512-NEXT: kshiftrw $4, %k7, %k0 -; AVX512-NEXT: kxorw %k4, %k0, %k4 -; AVX512-NEXT: sarb $7, %bpl -; AVX512-NEXT: kmovd %ebp, %k0 -; AVX512-NEXT: kshiftlw $15, %k4, %k4 -; AVX512-NEXT: kshiftrw $11, %k4, %k4 -; AVX512-NEXT: kxorw %k4, %k7, %k7 -; AVX512-NEXT: kshiftrw $5, %k7, %k4 -; AVX512-NEXT: kxorw %k5, %k4, %k5 -; AVX512-NEXT: sarb $7, %dil -; AVX512-NEXT: kmovd %edi, %k4 -; AVX512-NEXT: kshiftlw $15, %k5, %k5 -; AVX512-NEXT: kshiftrw $10, %k5, %k5 -; AVX512-NEXT: kxorw %k5, %k7, %k7 -; AVX512-NEXT: kshiftrw $6, %k7, %k5 -; AVX512-NEXT: kxorw %k6, %k5, %k6 -; AVX512-NEXT: sarb $7, %al -; AVX512-NEXT: kmovd %eax, %k5 -; AVX512-NEXT: kshiftlw $15, %k6, %k6 -; AVX512-NEXT: kshiftrw $9, %k6, %k6 -; AVX512-NEXT: kxorw %k6, %k7, %k6 -; AVX512-NEXT: kshiftrw $7, %k6, %k7 -; AVX512-NEXT: kxorw %k1, %k7, %k7 -; AVX512-NEXT: sarb $7, %cl -; AVX512-NEXT: kmovd %ecx, %k1 -; AVX512-NEXT: kshiftlw $15, %k7, %k7 -; AVX512-NEXT: kshiftrw $8, %k7, %k7 -; AVX512-NEXT: kxorw %k7, %k6, %k6 -; AVX512-NEXT: kshiftrw $8, %k6, %k7 -; AVX512-NEXT: kxorw %k2, %k7, %k7 -; AVX512-NEXT: sarb $7, %dl -; AVX512-NEXT: kmovd %edx, %k2 -; AVX512-NEXT: kshiftlw $15, %k7, %k7 -; AVX512-NEXT: kshiftrw $7, %k7, %k7 -; AVX512-NEXT: kxorw %k7, %k6, %k6 -; AVX512-NEXT: kshiftrw $9, %k6, %k7 -; AVX512-NEXT: kxorw %k3, %k7, %k3 -; AVX512-NEXT: sarb $7, %r8b -; AVX512-NEXT: kmovd %r8d, %k7 -; AVX512-NEXT: kshiftlw $15, %k3, %k3 -; AVX512-NEXT: kshiftrw $6, %k3, %k3 -; AVX512-NEXT: kxorw %k3, %k6, %k3 -; AVX512-NEXT: kshiftrw $10, %k3, %k6 -; AVX512-NEXT: kxorw %k0, %k6, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $5, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k3, %k0 -; AVX512-NEXT: kshiftrw $11, %k0, %k3 -; AVX512-NEXT: kxorw %k4, %k3, %k3 -; AVX512-NEXT: kshiftlw $15, %k3, %k3 -; AVX512-NEXT: kshiftrw $4, %k3, %k3 -; AVX512-NEXT: kxorw %k3, %k0, %k0 -; AVX512-NEXT: kshiftrw $12, %k0, %k3 -; AVX512-NEXT: kxorw %k5, %k3, %k3 -; AVX512-NEXT: kshiftlw $15, %k3, %k3 -; AVX512-NEXT: kshiftrw $3, %k3, %k3 -; AVX512-NEXT: kxorw %k3, %k0, %k0 -; AVX512-NEXT: kshiftrw $13, %k0, %k3 -; AVX512-NEXT: kxorw %k1, %k3, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k1 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $1, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftlw $1, %k0, %k0 -; AVX512-NEXT: kshiftrw $1, %k0, %k0 -; AVX512-NEXT: kshiftlw $15, %k7, %k1 -; AVX512-NEXT: korw %k1, %k0, %k0 +; 
AVX512-NEXT: korw %k0, %k1, %k0
 ; AVX512-NEXT: vpmovm2b %k0, %xmm0
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
 ; AVX512-NEXT: retq
   %z = call <16 x i1> @llvm.sadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
Index: llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll
+++ llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll
@@ -34,13453 +34,160 @@
 ; Legal types, depending on architecture.
 define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
-; SSE2-LABEL: v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil
-; SSE2-NEXT: movl %r9d, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %r9b
-; SSE2-NEXT: jno .LBB0_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r9d
-; SSE2-NEXT: .LBB0_2:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movl %esi, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %sil
-; SSE2-NEXT: jno .LBB0_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %esi
-; SSE2-NEXT: .LBB0_4:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movl %ebx, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %bl
-; SSE2-NEXT: jno .LBB0_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebx
-; SSE2-NEXT: .LBB0_6:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %dl
-; SSE2-NEXT: jno .LBB0_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %edx
-; SSE2-NEXT: .LBB0_8:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b
-; SSE2-NEXT: movl %r10d, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %r10b
-; SSE2-NEXT: jno .LBB0_10
-; SSE2-NEXT: # %bb.9:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r10d
-; SSE2-NEXT: .LBB0_10:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b
-; SSE2-NEXT: movl %r11d, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %r11b
-; SSE2-NEXT: jno .LBB0_12
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r11d
-; SSE2-NEXT: .LBB0_12:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl
-; SSE2-NEXT: movl %ebp, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %bpl
-; SSE2-NEXT: jno .LBB0_14
-; SSE2-NEXT: # %bb.13:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebp
-; SSE2-NEXT: .LBB0_14:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b
-; SSE2-NEXT: movl %r14d, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %r14b
-; SSE2-NEXT: jno .LBB0_16
-; SSE2-NEXT: # %bb.15:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl
%eax, %r14d -; SSE2-NEXT: .LBB0_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r15b -; SSE2-NEXT: jno .LBB0_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB0_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: jno .LBB0_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB0_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r13b -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB0_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB0_22: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dil -; SSE2-NEXT: jno .LBB0_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB0_24: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r8b -; SSE2-NEXT: jno .LBB0_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB0_26: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: jno .LBB0_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB0_28: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: subb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: subb %dl, %al -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB0_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB0_30: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %ecx -; SSE2-NEXT: subb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: subb %dl, %sil -; SSE2-NEXT: jno .LBB0_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %esi -; SSE2-NEXT: .LBB0_32: -; SSE2-NEXT: movzbl %sil, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r13b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movzbl %r12b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r15b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movzbl %r14b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %bpl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl %r11b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r10b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: jno .LBB0_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB0_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: jno .LBB0_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al 
-; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB0_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jno .LBB0_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB0_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB0_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB0_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r10b -; SSSE3-NEXT: jno .LBB0_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB0_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r11b -; SSSE3-NEXT: jno .LBB0_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB0_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bpl -; SSSE3-NEXT: jno .LBB0_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB0_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r14b -; SSSE3-NEXT: jno .LBB0_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB0_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r15b -; SSSE3-NEXT: jno .LBB0_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB0_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b -; SSSE3-NEXT: jno .LBB0_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB0_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r13b -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB0_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB0_22: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dil -; SSSE3-NEXT: jno .LBB0_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: addb $127, %al -; 
SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB0_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r8b -; SSSE3-NEXT: jno .LBB0_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB0_26: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jno .LBB0_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB0_28: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: subb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: subb %dl, %al -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB0_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB0_30: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %ecx -; SSSE3-NEXT: subb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: subb %dl, %sil -; SSSE3-NEXT: jno .LBB0_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: .LBB0_32: -; SSSE3-NEXT: movzbl %sil, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r8b, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r13b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: movzbl %r12b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r15b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movzbl %r14b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %bpl, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl %r11b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r10b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $15, %xmm1, %ecx -; SSE41-NEXT: pextrb $15, %xmm0, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB0_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB0_2: -; SSE41-NEXT: pextrb $14, %xmm1, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r11b -; SSE41-NEXT: jno .LBB0_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB0_4: -; SSE41-NEXT: pextrb $13, %xmm1, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB0_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB0_6: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $12, %xmm1, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r14b -; SSE41-NEXT: jno .LBB0_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB0_8: -; SSE41-NEXT: pextrb $11, %xmm1, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB0_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB0_10: -; SSE41-NEXT: pextrb $10, %xmm1, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r15b -; SSE41-NEXT: jno .LBB0_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: 
.LBB0_12: -; SSE41-NEXT: pextrb $9, %xmm1, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r12b -; SSE41-NEXT: jno .LBB0_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB0_14: -; SSE41-NEXT: pextrb $8, %xmm1, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r13b -; SSE41-NEXT: jno .LBB0_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB0_16: -; SSE41-NEXT: pextrb $7, %xmm1, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r10b -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB0_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB0_18: -; SSE41-NEXT: pextrb $6, %xmm1, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r9b -; SSE41-NEXT: jno .LBB0_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB0_20: -; SSE41-NEXT: pextrb $5, %xmm1, %ecx -; SSE41-NEXT: pextrb $5, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB0_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB0_22: -; SSE41-NEXT: pextrb $4, %xmm1, %ecx -; SSE41-NEXT: pextrb $4, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB0_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB0_24: -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: setns %cl -; SSE41-NEXT: subb %dl, %al -; SSE41-NEXT: jno .LBB0_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: addb $127, %cl -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB0_26: -; SSE41-NEXT: pextrb $2, %xmm1, %ebx -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: subb %bl, %dl -; SSE41-NEXT: setns %dl -; SSE41-NEXT: subb %bl, %cl -; SSE41-NEXT: jno .LBB0_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: addb $127, %dl -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB0_28: -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %dl -; SSE41-NEXT: jno .LBB0_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: .LBB0_30: -; SSE41-NEXT: pextrb $1, %xmm1, %esi -; SSE41-NEXT: pextrb $1, %xmm0, %r8d -; SSE41-NEXT: movl %r8d, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %r8b -; SSE41-NEXT: jno .LBB0_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r8d -; SSE41-NEXT: .LBB0_32: -; SSE41-NEXT: movzbl %dl, %edx 
-; SSE41-NEXT: movd %edx, %xmm0
-; SSE41-NEXT: movzbl %r8b, %edx
-; SSE41-NEXT: pinsrb $1, %edx, %xmm0
-; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm0
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $3, %eax, %xmm0
-; SSE41-NEXT: movzbl %dil, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm0
-; SSE41-NEXT: movzbl %bpl, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm0
-; SSE41-NEXT: movzbl %r9b, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm0
-; SSE41-NEXT: movzbl %r10b, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm0
-; SSE41-NEXT: movzbl %r13b, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm0
-; SSE41-NEXT: movzbl %r12b, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm0
-; SSE41-NEXT: movzbl %r15b, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $11, %eax, %xmm0
-; SSE41-NEXT: movzbl %r14b, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: movzbl %r11b, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v16i8:
+; SSE: # %bb.0:
+; SSE-NEXT: psubsb %xmm1, %xmm0
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: v16i8:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX-NEXT: vpextrb $15, %xmm0, %edx
-; AVX-NEXT: movl %edx, %eax
-; AVX-NEXT: subb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: subb %cl, %dl
-; AVX-NEXT: jno .LBB0_2
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: .LBB0_2:
-; AVX-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX-NEXT: vpextrb $14, %xmm0, %r11d
-; AVX-NEXT: movl %r11d, %eax
-; AVX-NEXT: subb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: subb %cl, %r11b
-; AVX-NEXT: jno .LBB0_4
-; AVX-NEXT: # %bb.3:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r11d
-; AVX-NEXT: .LBB0_4:
-; AVX-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX-NEXT: vpextrb $13, %xmm0, %edi
-; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: subb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: subb %cl, %dil
-; AVX-NEXT: jno .LBB0_6
-; AVX-NEXT: # %bb.5:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %edi
-; AVX-NEXT: .LBB0_6:
-; AVX-NEXT: pushq %rbp
-; AVX-NEXT: pushq %r15
-; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %r13
-; AVX-NEXT: pushq %r12
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX-NEXT: vpextrb $12, %xmm0, %r14d
-; AVX-NEXT: movl %r14d, %eax
-; AVX-NEXT: subb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: subb %cl, %r14b
-; AVX-NEXT: jno .LBB0_8
-; AVX-NEXT: # %bb.7:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r14d
-; AVX-NEXT: .LBB0_8:
-; AVX-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX-NEXT: vpextrb $11, %xmm0, %ebp
-; AVX-NEXT: movl %ebp, %eax
-; AVX-NEXT: subb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: subb %cl, %bpl
-; AVX-NEXT: jno .LBB0_10
-; AVX-NEXT: # %bb.9:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %ebp
-; AVX-NEXT: .LBB0_10:
-; AVX-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX-NEXT: vpextrb $10, %xmm0, %r15d
-; AVX-NEXT: movl %r15d, %eax
-; AVX-NEXT: subb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: subb %cl, %r15b
-; AVX-NEXT: jno .LBB0_12
-; AVX-NEXT: # %bb.11:
-; AVX-NEXT: addb $127, %al
-; AVX-NEXT: movl %eax, %r15d
-; AVX-NEXT: .LBB0_12:
-;
AVX-NEXT: vpextrb $9, %xmm1, %ecx -; AVX-NEXT: vpextrb $9, %xmm0, %r12d -; AVX-NEXT: movl %r12d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r12b -; AVX-NEXT: jno .LBB0_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r12d -; AVX-NEXT: .LBB0_14: -; AVX-NEXT: vpextrb $8, %xmm1, %ecx -; AVX-NEXT: vpextrb $8, %xmm0, %r13d -; AVX-NEXT: movl %r13d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r13b -; AVX-NEXT: jno .LBB0_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r13d -; AVX-NEXT: .LBB0_16: -; AVX-NEXT: vpextrb $7, %xmm1, %ecx -; AVX-NEXT: vpextrb $7, %xmm0, %r10d -; AVX-NEXT: movl %r10d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r10b -; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB0_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r10d -; AVX-NEXT: .LBB0_18: -; AVX-NEXT: vpextrb $6, %xmm1, %ecx -; AVX-NEXT: vpextrb $6, %xmm0, %r9d -; AVX-NEXT: movl %r9d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r9b -; AVX-NEXT: jno .LBB0_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r9d -; AVX-NEXT: .LBB0_20: -; AVX-NEXT: vpextrb $5, %xmm1, %ecx -; AVX-NEXT: vpextrb $5, %xmm0, %ebp -; AVX-NEXT: movl %ebp, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %bpl -; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB0_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: .LBB0_22: -; AVX-NEXT: vpextrb $4, %xmm1, %ecx -; AVX-NEXT: vpextrb $4, %xmm0, %edi -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %dil -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB0_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: .LBB0_24: -; AVX-NEXT: vpextrb $3, %xmm1, %edx -; AVX-NEXT: vpextrb $3, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: setns %cl -; AVX-NEXT: subb %dl, %al -; AVX-NEXT: jno .LBB0_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: addb $127, %cl -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB0_26: -; AVX-NEXT: vpextrb $2, %xmm1, %ebx -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: subb %bl, %dl -; AVX-NEXT: setns %dl -; AVX-NEXT: subb %bl, %cl -; AVX-NEXT: jno .LBB0_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: addb $127, %dl -; AVX-NEXT: movl %edx, %ecx -; AVX-NEXT: .LBB0_28: -; AVX-NEXT: vpextrb $0, %xmm1, %esi -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: movl %edx, %ebx -; AVX-NEXT: subb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: subb %sil, %dl -; AVX-NEXT: jno .LBB0_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %edx -; AVX-NEXT: .LBB0_30: -; AVX-NEXT: vpextrb $1, %xmm1, %esi -; AVX-NEXT: vpextrb $1, %xmm0, %r8d -; AVX-NEXT: movl %r8d, %ebx -; AVX-NEXT: subb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: subb %sil, %r8b -; AVX-NEXT: jno .LBB0_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %r8d -; AVX-NEXT: .LBB0_32: -; AVX-NEXT: movzbl %dl, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: movzbl %r8b, %edx -; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %cl, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: 
vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %dil, %eax
-; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %bpl, %eax
-; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r9b, %eax
-; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r10b, %eax
-; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r13b, %eax
-; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r12b, %eax
-; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r15b, %eax
-; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r14b, %eax
-; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl %r11b, %eax
-; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %r12
-; AVX-NEXT: popq %r13
-; AVX-NEXT: popq %r14
-; AVX-NEXT: popq %r15
-; AVX-NEXT: popq %rbp
+; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %z
 }

 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; SSE2-LABEL: v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b
-; SSE2-NEXT: movl %r8d, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %r8b
-; SSE2-NEXT: jno .LBB1_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r8d
-; SSE2-NEXT: .LBB1_2:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movl %r11d, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %r11b
-; SSE2-NEXT: jno .LBB1_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r11d
-; SSE2-NEXT: .LBB1_4:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movl %ebx, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %bl
-; SSE2-NEXT: jno .LBB1_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebx
-; SSE2-NEXT: .LBB1_6:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil
-; SSE2-NEXT: movl %esi, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %sil
-; SSE2-NEXT: jno .LBB1_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %esi
-; SSE2-NEXT: .LBB1_8:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %dl
-; SSE2-NEXT: jo .LBB1_9
-; SSE2-NEXT: # %bb.10:
-; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: jmp .LBB1_11
-; SSE2-NEXT: .LBB1_9:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: .LBB1_11:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-;
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dil -; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_13 -; SSE2-NEXT: # %bb.12: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB1_13: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r9b -; SSE2-NEXT: jno .LBB1_15 -; SSE2-NEXT: # %bb.14: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB1_15: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r10b -; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_17 -; SSE2-NEXT: # %bb.16: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB1_17: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bpl -; SSE2-NEXT: jno .LBB1_19 -; SSE2-NEXT: # %bb.18: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB1_19: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r14b -; SSE2-NEXT: jno .LBB1_21 -; SSE2-NEXT: # %bb.20: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB1_21: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r11b -; SSE2-NEXT: jno .LBB1_23 -; SSE2-NEXT: # %bb.22: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB1_23: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r8b -; SSE2-NEXT: jno .LBB1_25 -; SSE2-NEXT: # %bb.24: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB1_25: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r15b -; SSE2-NEXT: jno .LBB1_27 -; SSE2-NEXT: # %bb.26: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB1_27: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: jno .LBB1_29 -; SSE2-NEXT: # %bb.28: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB1_29: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r13b -; SSE2-NEXT: jno .LBB1_31 -; SSE2-NEXT: # %bb.30: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB1_31: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), 
%dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_33 -; SSE2-NEXT: # %bb.32: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_33: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: jno .LBB1_35 -; SSE2-NEXT: # %bb.34: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB1_35: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_37 -; SSE2-NEXT: # %bb.36: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_37: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB1_39 -; SSE2-NEXT: # %bb.38: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB1_39: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_41 -; SSE2-NEXT: # %bb.40: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB1_41: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_43 -; SSE2-NEXT: # %bb.42: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB1_43: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bpl -; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_45 -; SSE2-NEXT: # %bb.44: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB1_45: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r14b -; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_47 -; SSE2-NEXT: # %bb.46: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB1_47: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r15b -; SSE2-NEXT: movb 
%r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_49 -; SSE2-NEXT: # %bb.48: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB1_49: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_51 -; SSE2-NEXT: # %bb.50: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB1_51: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r13b -; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_53 -; SSE2-NEXT: # %bb.52: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB1_53: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r8b -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_55 -; SSE2-NEXT: # %bb.54: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB1_55: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r9b -; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_57 -; SSE2-NEXT: # %bb.56: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB1_57: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r10b -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_59 -; SSE2-NEXT: # %bb.58: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB1_59: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %ecx -; SSE2-NEXT: subb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: subb %dl, %r11b -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB1_61 -; SSE2-NEXT: # %bb.60: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %r11d -; SSE2-NEXT: .LBB1_61: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: subb %bl, %dl -; SSE2-NEXT: setns %dl -; SSE2-NEXT: subb %bl, %cl -; SSE2-NEXT: jno .LBB1_63 -; SSE2-NEXT: # %bb.62: -; SSE2-NEXT: addb $127, %dl -; SSE2-NEXT: movl %edx, %ecx -; SSE2-NEXT: .LBB1_63: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %edx -; SSE2-NEXT: subb %al, %dl -; SSE2-NEXT: setns %dl -; SSE2-NEXT: subb %al, %bl -; SSE2-NEXT: jno .LBB1_65 -; SSE2-NEXT: # %bb.64: -; SSE2-NEXT: addb $127, %dl -; SSE2-NEXT: movl %edx, %ebx -; SSE2-NEXT: .LBB1_65: -; SSE2-NEXT: movzbl %bl, %esi -; SSE2-NEXT: movzbl %cl, %edi -; SSE2-NEXT: movzbl %r11b, %r11d -; SSE2-NEXT: movzbl %r10b, %r10d -; SSE2-NEXT: movzbl %r9b, %r9d -; SSE2-NEXT: movzbl %r8b, %r8d -; SSE2-NEXT: movzbl %r13b, %r13d -; SSE2-NEXT: movzbl %r12b, %eax -; 
SSE2-NEXT: movzbl %r15b, %ebx -; SSE2-NEXT: movzbl %r14b, %edx -; SSE2-NEXT: movzbl %bpl, %ebp -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSE2-NEXT: movd %esi, %xmm12 -; SSE2-NEXT: movd %edi, %xmm6 -; SSE2-NEXT: movd %r11d, %xmm11 -; SSE2-NEXT: movd %r10d, %xmm2 -; SSE2-NEXT: movd %r9d, %xmm10 -; SSE2-NEXT: movd %r8d, %xmm5 -; SSE2-NEXT: movd %r13d, %xmm9 -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movd %ebx, %xmm8 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSE2-NEXT: movd %edx, %xmm14 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSE2-NEXT: movd %ebp, %xmm13 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSE2-NEXT: movd %ecx, %xmm7 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %r12d, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movd %r15d, %xmm4 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE2-NEXT: movd %r14d, %xmm15 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movd %r13d, %xmm0 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: movd %r11d, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: movd %r8d, %xmm11 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: movd %r9d, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSE2-NEXT: movd %r10d, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE2-NEXT: movd %ebx, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movd %r12d, %xmm6 -; SSE2-NEXT: punpcklbw 
{{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] -; SSE2-NEXT: movd %esi, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE2-NEXT: movd %r15d, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: movd %ebp, %xmm3 -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE2-NEXT: movd %r14d, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: movd %edx, %xmm15 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movd %r13d, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE2-NEXT: movd %edi, %xmm7 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movd %r11d, %xmm2 -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE2-NEXT: addq $8, %rsp -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: 
popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v32i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: pushq %rax -; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r8b -; SSSE3-NEXT: jno .LBB1_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB1_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r11b -; SSSE3-NEXT: jno .LBB1_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB1_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jno .LBB1_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB1_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: jno .LBB1_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB1_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jo .LBB1_9 -; SSSE3-NEXT: # %bb.10: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB1_11 -; SSSE3-NEXT: .LBB1_9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_11: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dil -; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_13 -; SSSE3-NEXT: # %bb.12: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB1_13: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: jno .LBB1_15 -; SSSE3-NEXT: # %bb.14: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB1_15: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r10b -; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_17 -; SSSE3-NEXT: # %bb.16: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB1_17: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; 
SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bpl -; SSSE3-NEXT: jno .LBB1_19 -; SSSE3-NEXT: # %bb.18: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB1_19: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r14b -; SSSE3-NEXT: jno .LBB1_21 -; SSSE3-NEXT: # %bb.20: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB1_21: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r11b -; SSSE3-NEXT: jno .LBB1_23 -; SSSE3-NEXT: # %bb.22: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB1_23: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r8b -; SSSE3-NEXT: jno .LBB1_25 -; SSSE3-NEXT: # %bb.24: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB1_25: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r15b -; SSSE3-NEXT: jno .LBB1_27 -; SSSE3-NEXT: # %bb.26: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB1_27: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b -; SSSE3-NEXT: jno .LBB1_29 -; SSSE3-NEXT: # %bb.28: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB1_29: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r13b -; SSSE3-NEXT: jno .LBB1_31 -; SSSE3-NEXT: # %bb.30: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB1_31: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_33 -; SSSE3-NEXT: # %bb.32: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_33: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: jno .LBB1_35 -; SSSE3-NEXT: # %bb.34: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB1_35: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_37 -; 
SSSE3-NEXT: # %bb.36: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_37: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB1_39 -; SSSE3-NEXT: # %bb.38: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB1_39: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_41 -; SSSE3-NEXT: # %bb.40: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB1_41: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_43 -; SSSE3-NEXT: # %bb.42: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB1_43: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bpl -; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_45 -; SSSE3-NEXT: # %bb.44: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB1_45: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r14b -; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_47 -; SSSE3-NEXT: # %bb.46: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB1_47: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r15b -; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_49 -; SSSE3-NEXT: # %bb.48: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB1_49: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b -; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_51 -; SSSE3-NEXT: # %bb.50: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB1_51: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r13b -; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_53 -; SSSE3-NEXT: # %bb.52: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB1_53: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl 
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r8b -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_55 -; SSSE3-NEXT: # %bb.54: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB1_55: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_57 -; SSSE3-NEXT: # %bb.56: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB1_57: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r10b -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_59 -; SSSE3-NEXT: # %bb.58: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB1_59: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %ecx -; SSSE3-NEXT: subb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: subb %dl, %r11b -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB1_61 -; SSSE3-NEXT: # %bb.60: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %r11d -; SSSE3-NEXT: .LBB1_61: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %ecx, %edx -; SSSE3-NEXT: subb %bl, %dl -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: subb %bl, %cl -; SSSE3-NEXT: jno .LBB1_63 -; SSSE3-NEXT: # %bb.62: -; SSSE3-NEXT: addb $127, %dl -; SSSE3-NEXT: movl %edx, %ecx -; SSSE3-NEXT: .LBB1_63: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: subb %al, %dl -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: subb %al, %bl -; SSSE3-NEXT: jno .LBB1_65 -; SSSE3-NEXT: # %bb.64: -; SSSE3-NEXT: addb $127, %dl -; SSSE3-NEXT: movl %edx, %ebx -; SSSE3-NEXT: .LBB1_65: -; SSSE3-NEXT: movzbl %bl, %esi -; SSSE3-NEXT: movzbl %cl, %edi -; SSSE3-NEXT: movzbl %r11b, %r11d -; SSSE3-NEXT: movzbl %r10b, %r10d -; SSSE3-NEXT: movzbl %r9b, %r9d -; SSSE3-NEXT: movzbl %r8b, %r8d -; SSSE3-NEXT: movzbl %r13b, %r13d -; SSSE3-NEXT: movzbl %r12b, %eax -; SSSE3-NEXT: movzbl %r15b, %ebx -; SSSE3-NEXT: movzbl %r14b, %edx -; SSSE3-NEXT: movzbl %bpl, %ebp -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSSE3-NEXT: movd %esi, %xmm12 -; SSSE3-NEXT: movd %edi, %xmm6 -; SSSE3-NEXT: movd %r11d, %xmm11 -; SSSE3-NEXT: movd %r10d, %xmm2 -; SSSE3-NEXT: movd %r9d, %xmm10 -; SSSE3-NEXT: movd %r8d, %xmm5 -; SSSE3-NEXT: movd %r13d, %xmm9 -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movd %ebx, %xmm8 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSSE3-NEXT: movd %edx, %xmm14 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 
1-byte Folded Reload -; SSSE3-NEXT: movd %ebp, %xmm13 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSSE3-NEXT: movd %ecx, %xmm7 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %r12d, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movd %r15d, %xmm4 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSSE3-NEXT: movd %r14d, %xmm15 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movd %r13d, %xmm0 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSSE3-NEXT: movd %r11d, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSSE3-NEXT: movd %r8d, %xmm11 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSSE3-NEXT: movd %r9d, %xmm12 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSSE3-NEXT: movd %r10d, %xmm10 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSSE3-NEXT: movd %ebx, %xmm9 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSSE3-NEXT: movd %eax, %xmm5 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movd %r12d, %xmm6 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] -; SSSE3-NEXT: movd %esi, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSSE3-NEXT: movd %r15d, %xmm13 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSSE3-NEXT: movd %ebp, %xmm3 -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSSE3-NEXT: movd %r14d, %xmm14 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSSE3-NEXT: movd %edx, %xmm15 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSSE3-NEXT: movd %r13d, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSSE3-NEXT: movd %edi, %xmm7 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movd %r11d, %xmm2 -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSSE3-NEXT: addq $8, %rsp -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $15, %xmm3, %ecx -; SSE41-NEXT: pextrb $15, %xmm1, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB1_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB1_2: -; SSE41-NEXT: pextrb $14, %xmm3, %ecx -; SSE41-NEXT: pextrb $14, %xmm1, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: jno .LBB1_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB1_4: -; SSE41-NEXT: 
pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $13, %xmm3, %ecx -; SSE41-NEXT: pextrb $13, %xmm1, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jo .LBB1_5 -; SSE41-NEXT: # %bb.6: -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jmp .LBB1_7 -; SSE41-NEXT: .LBB1_5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB1_7: -; SSE41-NEXT: pextrb $12, %xmm3, %ecx -; SSE41-NEXT: pextrb $12, %xmm1, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB1_9 -; SSE41-NEXT: # %bb.8: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB1_9: -; SSE41-NEXT: pextrb $11, %xmm3, %ecx -; SSE41-NEXT: pextrb $11, %xmm1, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB1_11 -; SSE41-NEXT: # %bb.10: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB1_11: -; SSE41-NEXT: pextrb $10, %xmm3, %ecx -; SSE41-NEXT: pextrb $10, %xmm1, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bl -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_13 -; SSE41-NEXT: # %bb.12: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB1_13: -; SSE41-NEXT: pextrb $9, %xmm3, %ecx -; SSE41-NEXT: pextrb $9, %xmm1, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_15 -; SSE41-NEXT: # %bb.14: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB1_15: -; SSE41-NEXT: pextrb $8, %xmm3, %ecx -; SSE41-NEXT: pextrb $8, %xmm1, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_17 -; SSE41-NEXT: # %bb.16: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB1_17: -; SSE41-NEXT: pextrb $7, %xmm3, %ecx -; SSE41-NEXT: pextrb $7, %xmm1, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_19 -; SSE41-NEXT: # %bb.18: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB1_19: -; SSE41-NEXT: pextrb $6, %xmm3, %ecx -; SSE41-NEXT: pextrb $6, %xmm1, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_21 -; SSE41-NEXT: # %bb.20: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB1_21: -; SSE41-NEXT: pextrb $5, %xmm3, %ecx -; SSE41-NEXT: pextrb $5, %xmm1, %r8d -; SSE41-NEXT: movl %r8d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: 
subb %cl, %r8b -; SSE41-NEXT: jno .LBB1_23 -; SSE41-NEXT: # %bb.22: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r8d -; SSE41-NEXT: .LBB1_23: -; SSE41-NEXT: pextrb $4, %xmm3, %ecx -; SSE41-NEXT: pextrb $4, %xmm1, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r11b -; SSE41-NEXT: jno .LBB1_25 -; SSE41-NEXT: # %bb.24: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB1_25: -; SSE41-NEXT: pextrb $3, %xmm3, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r10b -; SSE41-NEXT: jno .LBB1_27 -; SSE41-NEXT: # %bb.26: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB1_27: -; SSE41-NEXT: pextrb $2, %xmm3, %ecx -; SSE41-NEXT: pextrb $2, %xmm1, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r14b -; SSE41-NEXT: jno .LBB1_29 -; SSE41-NEXT: # %bb.28: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB1_29: -; SSE41-NEXT: pextrb $0, %xmm3, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r9b -; SSE41-NEXT: jno .LBB1_31 -; SSE41-NEXT: # %bb.30: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB1_31: -; SSE41-NEXT: pextrb $1, %xmm3, %ecx -; SSE41-NEXT: pextrb $1, %xmm1, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB1_33 -; SSE41-NEXT: # %bb.32: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB1_33: -; SSE41-NEXT: pextrb $15, %xmm2, %ecx -; SSE41-NEXT: pextrb $15, %xmm0, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: movl %esi, %r15d -; SSE41-NEXT: subb %cl, %bl -; SSE41-NEXT: jno .LBB1_35 -; SSE41-NEXT: # %bb.34: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB1_35: -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrb $14, %xmm2, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: movl %edx, %edi -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: jno .LBB1_37 -; SSE41-NEXT: # %bb.36: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB1_37: -; SSE41-NEXT: pextrb $13, %xmm2, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jo .LBB1_38 -; SSE41-NEXT: # %bb.39: -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jmp .LBB1_40 -; SSE41-NEXT: .LBB1_38: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB1_40: -; SSE41-NEXT: movl %edi, %edx -; SSE41-NEXT: pextrb $12, %xmm2, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: movl %r15d, %esi -; SSE41-NEXT: jno .LBB1_42 -; SSE41-NEXT: # %bb.41: -; SSE41-NEXT: addb $127, %al -; 
SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB1_42: -; SSE41-NEXT: pextrb $11, %xmm2, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r15b -; SSE41-NEXT: jno .LBB1_44 -; SSE41-NEXT: # %bb.43: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB1_44: -; SSE41-NEXT: pextrb $10, %xmm2, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r12b -; SSE41-NEXT: jno .LBB1_46 -; SSE41-NEXT: # %bb.45: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB1_46: -; SSE41-NEXT: pextrb $9, %xmm2, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r13b -; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_48 -; SSE41-NEXT: # %bb.47: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB1_48: -; SSE41-NEXT: pextrb $8, %xmm2, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r11b -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB1_50 -; SSE41-NEXT: # %bb.49: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB1_50: -; SSE41-NEXT: pextrb $7, %xmm2, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r10b -; SSE41-NEXT: jno .LBB1_52 -; SSE41-NEXT: # %bb.51: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB1_52: -; SSE41-NEXT: pextrb $6, %xmm2, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: movl %edi, %r14d -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB1_54 -; SSE41-NEXT: # %bb.53: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB1_54: -; SSE41-NEXT: pextrb $5, %xmm2, %ecx -; SSE41-NEXT: pextrb $5, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB1_56 -; SSE41-NEXT: # %bb.55: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB1_56: -; SSE41-NEXT: pextrb $4, %xmm2, %edx -; SSE41-NEXT: pextrb $4, %xmm0, %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: setns %cl -; SSE41-NEXT: subb %dl, %al -; SSE41-NEXT: jno .LBB1_58 -; SSE41-NEXT: # %bb.57: -; SSE41-NEXT: addb $127, %cl -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB1_58: -; SSE41-NEXT: pextrb $3, %xmm2, %ebx -; SSE41-NEXT: pextrb $3, %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: subb %bl, %dl -; SSE41-NEXT: setns %dl -; SSE41-NEXT: subb %bl, %cl -; SSE41-NEXT: jno 
.LBB1_60 -; SSE41-NEXT: # %bb.59: -; SSE41-NEXT: addb $127, %dl -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB1_60: -; SSE41-NEXT: pextrb $2, %xmm2, %esi -; SSE41-NEXT: pextrb $2, %xmm0, %edx -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %dl -; SSE41-NEXT: jno .LBB1_62 -; SSE41-NEXT: # %bb.61: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: .LBB1_62: -; SSE41-NEXT: pextrb $0, %xmm2, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %r8d -; SSE41-NEXT: movl %r8d, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %r8b -; SSE41-NEXT: jno .LBB1_64 -; SSE41-NEXT: # %bb.63: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r8d -; SSE41-NEXT: .LBB1_64: -; SSE41-NEXT: pextrb $1, %xmm2, %esi -; SSE41-NEXT: pextrb $1, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %r9b -; SSE41-NEXT: jno .LBB1_66 -; SSE41-NEXT: # %bb.65: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r9d -; SSE41-NEXT: .LBB1_66: -; SSE41-NEXT: movzbl %r8b, %esi -; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: movzbl %r9b, %esi -; SSE41-NEXT: pinsrb $1, %esi, %xmm0 -; SSE41-NEXT: movzbl %dl, %edx -; SSE41-NEXT: pinsrb $2, %edx, %xmm0 -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm0 -; SSE41-NEXT: movzbl %dil, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm0 -; SSE41-NEXT: movzbl %bpl, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm0 -; SSE41-NEXT: movzbl %r10b, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: movzbl %r11b, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: pinsrb $11, %eax, %xmm0 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $14, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: movd %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $1, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $2, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $3, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $4, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $5, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $6, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $7, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $8, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $9, %eax, %xmm1 -; SSE41-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $10, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $11, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $12, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $13, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $14, %eax, %xmm1
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $15, %eax, %xmm1
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v32i8:
+; SSE: # %bb.0:
+; SSE-NEXT: psubsb %xmm2, %xmm0
+; SSE-NEXT: psubsb %xmm3, %xmm1
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: v32i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $15, %xmm0, %edx
-; AVX1-NEXT: movl %edx, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %dl
-; AVX1-NEXT: jo .LBB1_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jmp .LBB1_3
-; AVX1-NEXT: .LBB1_1:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: # kill: def $al killed $al def $eax
-; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: .LBB1_3:
-; AVX1-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $14, %xmm0, %edx
-; AVX1-NEXT: movl %edx, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %dl
-; AVX1-NEXT: jno .LBB1_5
-; AVX1-NEXT: # %bb.4:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: .LBB1_5:
-; AVX1-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $13, %xmm0, %esi
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %sil
-; AVX1-NEXT: jo .LBB1_6
-; AVX1-NEXT: # %bb.7:
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jmp .LBB1_8
-; AVX1-NEXT: .LBB1_6:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: # kill: def $al killed $al def $eax
-; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: .LBB1_8:
-; AVX1-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $12, %xmm0, %esi
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %sil
-; AVX1-NEXT: jno .LBB1_10
-; AVX1-NEXT: # %bb.9:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %esi
-; AVX1-NEXT: .LBB1_10:
-; AVX1-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $11, %xmm0, %edi
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %dil
-; AVX1-NEXT: jno .LBB1_12
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %edi
-; AVX1-NEXT: .LBB1_12:
-; AVX1-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $10, %xmm0, %ebp
-; AVX1-NEXT: movl %ebp, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %bpl
-; AVX1-NEXT: jno .LBB1_14
-; AVX1-NEXT: # %bb.13:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %ebp
-; AVX1-NEXT: .LBB1_14:
-; AVX1-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $9, %xmm0, %ebx
-;
AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bl -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_16 -; AVX1-NEXT: # %bb.15: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB1_16: -; AVX1-NEXT: vpextrb $8, %xmm1, %ecx -; AVX1-NEXT: vpextrb $8, %xmm0, %esi -; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %sil -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_18 -; AVX1-NEXT: # %bb.17: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %esi -; AVX1-NEXT: .LBB1_18: -; AVX1-NEXT: vpextrb $7, %xmm1, %ecx -; AVX1-NEXT: vpextrb $7, %xmm0, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dl -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_20 -; AVX1-NEXT: # %bb.19: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB1_20: -; AVX1-NEXT: vpextrb $6, %xmm1, %ecx -; AVX1-NEXT: vpextrb $6, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: jno .LBB1_22 -; AVX1-NEXT: # %bb.21: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB1_22: -; AVX1-NEXT: vpextrb $5, %xmm1, %ecx -; AVX1-NEXT: vpextrb $5, %xmm0, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bpl -; AVX1-NEXT: jno .LBB1_24 -; AVX1-NEXT: # %bb.23: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB1_24: -; AVX1-NEXT: vpextrb $4, %xmm1, %ecx -; AVX1-NEXT: vpextrb $4, %xmm0, %r11d -; AVX1-NEXT: movl %r11d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r11b -; AVX1-NEXT: jno .LBB1_26 -; AVX1-NEXT: # %bb.25: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r11d -; AVX1-NEXT: .LBB1_26: -; AVX1-NEXT: vpextrb $3, %xmm1, %ecx -; AVX1-NEXT: vpextrb $3, %xmm0, %r14d -; AVX1-NEXT: movl %r14d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r14b -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_28 -; AVX1-NEXT: # %bb.27: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: .LBB1_28: -; AVX1-NEXT: vpextrb $2, %xmm1, %ecx -; AVX1-NEXT: vpextrb $2, %xmm0, %r8d -; AVX1-NEXT: movl %r8d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r8b -; AVX1-NEXT: jno .LBB1_30 -; AVX1-NEXT: # %bb.29: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r8d -; AVX1-NEXT: .LBB1_30: -; AVX1-NEXT: vpextrb $0, %xmm1, %ecx -; AVX1-NEXT: vpextrb $0, %xmm0, %r10d -; AVX1-NEXT: movl %r10d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r10b -; AVX1-NEXT: jno .LBB1_32 -; AVX1-NEXT: # %bb.31: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r10d -; AVX1-NEXT: .LBB1_32: -; AVX1-NEXT: vpextrb $1, %xmm1, %ecx -; AVX1-NEXT: vpextrb $1, %xmm0, %r9d -; AVX1-NEXT: movl %r9d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r9b -; AVX1-NEXT: jno .LBB1_34 -; AVX1-NEXT: # %bb.33: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r9d -; AVX1-NEXT: .LBB1_34: -; AVX1-NEXT: 
vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpextrb $15, %xmm0, %ebx -; AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: movl %esi, %r12d -; AVX1-NEXT: subb %cl, %bl -; AVX1-NEXT: jno .LBB1_36 -; AVX1-NEXT: # %bb.35: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB1_36: -; AVX1-NEXT: vpextrb $14, %xmm1, %ecx -; AVX1-NEXT: vpextrb $14, %xmm0, %esi -; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: movl %edx, %r13d -; AVX1-NEXT: subb %cl, %sil -; AVX1-NEXT: jno .LBB1_38 -; AVX1-NEXT: # %bb.37: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %esi -; AVX1-NEXT: .LBB1_38: -; AVX1-NEXT: vpextrb $13, %xmm1, %ecx -; AVX1-NEXT: vpextrb $13, %xmm0, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: movl %edi, %ebp -; AVX1-NEXT: subb %cl, %dl -; AVX1-NEXT: jno .LBB1_40 -; AVX1-NEXT: # %bb.39: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB1_40: -; AVX1-NEXT: vpextrb $12, %xmm1, %ecx -; AVX1-NEXT: vpextrb $12, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jo .LBB1_41 -; AVX1-NEXT: # %bb.42: -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jmp .LBB1_43 -; AVX1-NEXT: .LBB1_41: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB1_43: -; AVX1-NEXT: movl %ebp, %edi -; AVX1-NEXT: vpextrb $11, %xmm1, %ecx -; AVX1-NEXT: vpextrb $11, %xmm0, %r15d -; AVX1-NEXT: movl %r15d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r15b -; AVX1-NEXT: movl %r12d, %esi -; AVX1-NEXT: movl %r13d, %edx -; AVX1-NEXT: jno .LBB1_45 -; AVX1-NEXT: # %bb.44: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: .LBB1_45: -; AVX1-NEXT: vpextrb $10, %xmm1, %ecx -; AVX1-NEXT: vpextrb $10, %xmm0, %r12d -; AVX1-NEXT: movl %r12d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r12b -; AVX1-NEXT: jno .LBB1_47 -; AVX1-NEXT: # %bb.46: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r12d -; AVX1-NEXT: .LBB1_47: -; AVX1-NEXT: vpextrb $9, %xmm1, %ecx -; AVX1-NEXT: vpextrb $9, %xmm0, %r13d -; AVX1-NEXT: movl %r13d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r13b -; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_49 -; AVX1-NEXT: # %bb.48: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r13d -; AVX1-NEXT: .LBB1_49: -; AVX1-NEXT: vpextrb $8, %xmm1, %ecx -; AVX1-NEXT: vpextrb $8, %xmm0, %r11d -; AVX1-NEXT: movl %r11d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r11b -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) 
# 4-byte Spill -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB1_51 -; AVX1-NEXT: # %bb.50: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r11d -; AVX1-NEXT: .LBB1_51: -; AVX1-NEXT: vpextrb $7, %xmm1, %ecx -; AVX1-NEXT: vpextrb $7, %xmm0, %r10d -; AVX1-NEXT: movl %r10d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r10b -; AVX1-NEXT: jno .LBB1_53 -; AVX1-NEXT: # %bb.52: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r10d -; AVX1-NEXT: .LBB1_53: -; AVX1-NEXT: vpextrb $6, %xmm1, %ecx -; AVX1-NEXT: vpextrb $6, %xmm0, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bpl -; AVX1-NEXT: jno .LBB1_55 -; AVX1-NEXT: # %bb.54: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB1_55: -; AVX1-NEXT: vpextrb $5, %xmm1, %ecx -; AVX1-NEXT: vpextrb $5, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: jno .LBB1_57 -; AVX1-NEXT: # %bb.56: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB1_57: -; AVX1-NEXT: vpextrb $4, %xmm1, %edx -; AVX1-NEXT: vpextrb $4, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: subb %dl, %cl -; AVX1-NEXT: setns %cl -; AVX1-NEXT: subb %dl, %al -; AVX1-NEXT: jno .LBB1_59 -; AVX1-NEXT: # %bb.58: -; AVX1-NEXT: addb $127, %cl -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB1_59: -; AVX1-NEXT: vpextrb $3, %xmm1, %ebx -; AVX1-NEXT: vpextrb $3, %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: subb %bl, %dl -; AVX1-NEXT: setns %dl -; AVX1-NEXT: subb %bl, %cl -; AVX1-NEXT: jno .LBB1_61 -; AVX1-NEXT: # %bb.60: -; AVX1-NEXT: addb $127, %dl -; AVX1-NEXT: movl %edx, %ecx -; AVX1-NEXT: .LBB1_61: -; AVX1-NEXT: vpextrb $2, %xmm1, %esi -; AVX1-NEXT: vpextrb $2, %xmm0, %edx -; AVX1-NEXT: movl %edx, %ebx -; AVX1-NEXT: subb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: subb %sil, %dl -; AVX1-NEXT: jno .LBB1_63 -; AVX1-NEXT: # %bb.62: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %edx -; AVX1-NEXT: .LBB1_63: -; AVX1-NEXT: vpextrb $0, %xmm1, %esi -; AVX1-NEXT: vpextrb $0, %xmm0, %r8d -; AVX1-NEXT: movl %r8d, %ebx -; AVX1-NEXT: subb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: subb %sil, %r8b -; AVX1-NEXT: jo .LBB1_64 -; AVX1-NEXT: # %bb.65: -; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX1-NEXT: jmp .LBB1_66 -; AVX1-NEXT: .LBB1_64: -; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r8d -; AVX1-NEXT: .LBB1_66: -; AVX1-NEXT: vpextrb $1, %xmm1, %esi -; AVX1-NEXT: vpextrb $1, %xmm0, %r9d -; AVX1-NEXT: movl %r9d, %ebx -; AVX1-NEXT: subb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: subb %sil, %r9b -; AVX1-NEXT: jno .LBB1_68 -; AVX1-NEXT: # %bb.67: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r9d -; AVX1-NEXT: .LBB1_68: -; AVX1-NEXT: movzbl %r8b, %esi -; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: movzbl %r9b, %esi -; AVX1-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %dl, %edx -; AVX1-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %dil, %eax -; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %bpl, %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r10b, %eax -; AVX1-NEXT: vpinsrb 
$7, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r11b, %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r13b, %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r12b, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r15b, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl %r14b, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubsb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; AVX2-NEXT: vpextrb $15, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: jo .LBB1_1 -; AVX2-NEXT: # %bb.2: -; 
AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB1_3 -; AVX2-NEXT: .LBB1_1: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB1_3: -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: jno .LBB1_5 -; AVX2-NEXT: # %bb.4: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB1_5: -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: jo .LBB1_6 -; AVX2-NEXT: # %bb.7: -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB1_8 -; AVX2-NEXT: .LBB1_6: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB1_8: -; AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: jno .LBB1_10 -; AVX2-NEXT: # %bb.9: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB1_10: -; AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: jno .LBB1_12 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB1_12: -; AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: jno .LBB1_14 -; AVX2-NEXT: # %bb.13: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB1_14: -; AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bl -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_16 -; AVX2-NEXT: # %bb.15: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB1_16: -; AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_18 -; AVX2-NEXT: # %bb.17: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB1_18: -; AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_20 -; AVX2-NEXT: # %bb.19: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB1_20: -; AVX2-NEXT: vpextrb $6, %xmm1, %ecx -; AVX2-NEXT: vpextrb $6, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, 
%dil -; AVX2-NEXT: jno .LBB1_22 -; AVX2-NEXT: # %bb.21: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB1_22: -; AVX2-NEXT: vpextrb $5, %xmm1, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: jno .LBB1_24 -; AVX2-NEXT: # %bb.23: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB1_24: -; AVX2-NEXT: vpextrb $4, %xmm1, %ecx -; AVX2-NEXT: vpextrb $4, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r11b -; AVX2-NEXT: jno .LBB1_26 -; AVX2-NEXT: # %bb.25: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB1_26: -; AVX2-NEXT: vpextrb $3, %xmm1, %ecx -; AVX2-NEXT: vpextrb $3, %xmm0, %r14d -; AVX2-NEXT: movl %r14d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r14b -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_28 -; AVX2-NEXT: # %bb.27: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: .LBB1_28: -; AVX2-NEXT: vpextrb $2, %xmm1, %ecx -; AVX2-NEXT: vpextrb $2, %xmm0, %r8d -; AVX2-NEXT: movl %r8d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r8b -; AVX2-NEXT: jno .LBB1_30 -; AVX2-NEXT: # %bb.29: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r8d -; AVX2-NEXT: .LBB1_30: -; AVX2-NEXT: vpextrb $0, %xmm1, %ecx -; AVX2-NEXT: vpextrb $0, %xmm0, %r10d -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r10b -; AVX2-NEXT: jno .LBB1_32 -; AVX2-NEXT: # %bb.31: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r10d -; AVX2-NEXT: .LBB1_32: -; AVX2-NEXT: vpextrb $1, %xmm1, %ecx -; AVX2-NEXT: vpextrb $1, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r9b -; AVX2-NEXT: jno .LBB1_34 -; AVX2-NEXT: # %bb.33: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r9d -; AVX2-NEXT: .LBB1_34: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: movl %esi, %r12d -; AVX2-NEXT: subb %cl, %bl -; AVX2-NEXT: jno .LBB1_36 -; AVX2-NEXT: # %bb.35: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB1_36: -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: movl %edx, %r13d -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: jno .LBB1_38 -; AVX2-NEXT: # %bb.37: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB1_38: -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: movl %edi, %ebp -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: jno .LBB1_40 -; AVX2-NEXT: # %bb.39: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB1_40: -; AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; 
AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jo .LBB1_41 -; AVX2-NEXT: # %bb.42: -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB1_43 -; AVX2-NEXT: .LBB1_41: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB1_43: -; AVX2-NEXT: movl %ebp, %edi -; AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, %r15d -; AVX2-NEXT: movl %r15d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r15b -; AVX2-NEXT: movl %r12d, %esi -; AVX2-NEXT: movl %r13d, %edx -; AVX2-NEXT: jno .LBB1_45 -; AVX2-NEXT: # %bb.44: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: .LBB1_45: -; AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r12b -; AVX2-NEXT: jno .LBB1_47 -; AVX2-NEXT: # %bb.46: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r12d -; AVX2-NEXT: .LBB1_47: -; AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r13b -; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_49 -; AVX2-NEXT: # %bb.48: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: .LBB1_49: -; AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r11b -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB1_51 -; AVX2-NEXT: # %bb.50: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB1_51: -; AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, %r10d -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r10b -; AVX2-NEXT: jno .LBB1_53 -; AVX2-NEXT: # %bb.52: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r10d -; AVX2-NEXT: .LBB1_53: -; AVX2-NEXT: vpextrb $6, %xmm1, %ecx -; AVX2-NEXT: vpextrb $6, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: jno .LBB1_55 -; AVX2-NEXT: # %bb.54: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB1_55: -; AVX2-NEXT: vpextrb $5, %xmm1, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: jno .LBB1_57 -; AVX2-NEXT: # %bb.56: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB1_57: -; AVX2-NEXT: vpextrb $4, %xmm1, %edx -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: subb %dl, %cl -; AVX2-NEXT: setns %cl -; AVX2-NEXT: subb %dl, %al -; AVX2-NEXT: jno 
.LBB1_59 -; AVX2-NEXT: # %bb.58: -; AVX2-NEXT: addb $127, %cl -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB1_59: -; AVX2-NEXT: vpextrb $3, %xmm1, %ebx -; AVX2-NEXT: vpextrb $3, %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: subb %bl, %dl -; AVX2-NEXT: setns %dl -; AVX2-NEXT: subb %bl, %cl -; AVX2-NEXT: jno .LBB1_61 -; AVX2-NEXT: # %bb.60: -; AVX2-NEXT: addb $127, %dl -; AVX2-NEXT: movl %edx, %ecx -; AVX2-NEXT: .LBB1_61: -; AVX2-NEXT: vpextrb $2, %xmm1, %esi -; AVX2-NEXT: vpextrb $2, %xmm0, %edx -; AVX2-NEXT: movl %edx, %ebx -; AVX2-NEXT: subb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: subb %sil, %dl -; AVX2-NEXT: jno .LBB1_63 -; AVX2-NEXT: # %bb.62: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %edx -; AVX2-NEXT: .LBB1_63: -; AVX2-NEXT: vpextrb $0, %xmm1, %esi -; AVX2-NEXT: vpextrb $0, %xmm0, %r8d -; AVX2-NEXT: movl %r8d, %ebx -; AVX2-NEXT: subb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: subb %sil, %r8b -; AVX2-NEXT: jo .LBB1_64 -; AVX2-NEXT: # %bb.65: -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX2-NEXT: jmp .LBB1_66 -; AVX2-NEXT: .LBB1_64: -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r8d -; AVX2-NEXT: .LBB1_66: -; AVX2-NEXT: vpextrb $1, %xmm1, %esi -; AVX2-NEXT: vpextrb $1, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %ebx -; AVX2-NEXT: subb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: subb %sil, %r9b -; AVX2-NEXT: jno .LBB1_68 -; AVX2-NEXT: # %bb.67: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r9d -; AVX2-NEXT: .LBB1_68: -; AVX2-NEXT: movzbl %r8b, %esi -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: movzbl %r9b, %esi -; AVX2-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %dil, %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %bpl, %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r10b, %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r11b, %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r13b, %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r12b, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r15b, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $4, 
%eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl %r14b, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512-NEXT: vpextrb $15, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: jo .LBB1_1 -; AVX512-NEXT: # %bb.2: -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB1_3 -; AVX512-NEXT: .LBB1_1: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB1_3: -; AVX512-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512-NEXT: vpextrb $14, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: jno .LBB1_5 -; AVX512-NEXT: # %bb.4: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB1_5: -; AVX512-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512-NEXT: vpextrb $13, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: jo .LBB1_6 -; AVX512-NEXT: # %bb.7: -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB1_8 -; AVX512-NEXT: .LBB1_6: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB1_8: -; AVX512-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512-NEXT: vpextrb $12, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: jno .LBB1_10 -; AVX512-NEXT: # %bb.9: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB1_10: -; 
AVX512-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512-NEXT: vpextrb $11, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: jno .LBB1_12 -; AVX512-NEXT: # %bb.11: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB1_12: -; AVX512-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512-NEXT: vpextrb $10, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: jno .LBB1_14 -; AVX512-NEXT: # %bb.13: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB1_14: -; AVX512-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512-NEXT: vpextrb $9, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bl -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_16 -; AVX512-NEXT: # %bb.15: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB1_16: -; AVX512-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512-NEXT: vpextrb $8, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_18 -; AVX512-NEXT: # %bb.17: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB1_18: -; AVX512-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512-NEXT: vpextrb $7, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_20 -; AVX512-NEXT: # %bb.19: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB1_20: -; AVX512-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512-NEXT: vpextrb $6, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: jno .LBB1_22 -; AVX512-NEXT: # %bb.21: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB1_22: -; AVX512-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512-NEXT: vpextrb $5, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: jno .LBB1_24 -; AVX512-NEXT: # %bb.23: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB1_24: -; AVX512-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512-NEXT: vpextrb $4, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r11b -; AVX512-NEXT: jno .LBB1_26 -; AVX512-NEXT: # %bb.25: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB1_26: -; AVX512-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512-NEXT: vpextrb $3, %xmm0, %r14d -; AVX512-NEXT: movl %r14d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r14b -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_28 -; AVX512-NEXT: # %bb.27: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: .LBB1_28: -; AVX512-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512-NEXT: 
vpextrb $2, %xmm0, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r8b -; AVX512-NEXT: jno .LBB1_30 -; AVX512-NEXT: # %bb.29: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r8d -; AVX512-NEXT: .LBB1_30: -; AVX512-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512-NEXT: vpextrb $0, %xmm0, %r10d -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r10b -; AVX512-NEXT: jno .LBB1_32 -; AVX512-NEXT: # %bb.31: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r10d -; AVX512-NEXT: .LBB1_32: -; AVX512-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512-NEXT: vpextrb $1, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r9b -; AVX512-NEXT: jno .LBB1_34 -; AVX512-NEXT: # %bb.33: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r9d -; AVX512-NEXT: .LBB1_34: -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpextrb $15, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: movl %esi, %r12d -; AVX512-NEXT: subb %cl, %bl -; AVX512-NEXT: jno .LBB1_36 -; AVX512-NEXT: # %bb.35: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB1_36: -; AVX512-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512-NEXT: vpextrb $14, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: movl %edx, %r13d -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: jno .LBB1_38 -; AVX512-NEXT: # %bb.37: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB1_38: -; AVX512-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512-NEXT: vpextrb $13, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: movl %edi, %ebp -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: jno .LBB1_40 -; AVX512-NEXT: # %bb.39: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB1_40: -; AVX512-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512-NEXT: vpextrb $12, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jo .LBB1_41 -; AVX512-NEXT: # %bb.42: -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB1_43 -; AVX512-NEXT: .LBB1_41: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB1_43: -; AVX512-NEXT: movl %ebp, %edi -; AVX512-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512-NEXT: vpextrb $11, %xmm0, %r15d -; AVX512-NEXT: movl %r15d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r15b -; AVX512-NEXT: movl %r12d, %esi -; AVX512-NEXT: movl %r13d, %edx -; AVX512-NEXT: jno .LBB1_45 -; AVX512-NEXT: # %bb.44: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: .LBB1_45: -; AVX512-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512-NEXT: vpextrb $10, %xmm0, %r12d -; AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r12b -; AVX512-NEXT: 
jno .LBB1_47 -; AVX512-NEXT: # %bb.46: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r12d -; AVX512-NEXT: .LBB1_47: -; AVX512-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512-NEXT: vpextrb $9, %xmm0, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r13b -; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_49 -; AVX512-NEXT: # %bb.48: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: .LBB1_49: -; AVX512-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512-NEXT: vpextrb $8, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r11b -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB1_51 -; AVX512-NEXT: # %bb.50: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB1_51: -; AVX512-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512-NEXT: vpextrb $7, %xmm0, %r10d -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r10b -; AVX512-NEXT: jno .LBB1_53 -; AVX512-NEXT: # %bb.52: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r10d -; AVX512-NEXT: .LBB1_53: -; AVX512-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512-NEXT: vpextrb $6, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: jno .LBB1_55 -; AVX512-NEXT: # %bb.54: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB1_55: -; AVX512-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512-NEXT: vpextrb $5, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: jno .LBB1_57 -; AVX512-NEXT: # %bb.56: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB1_57: -; AVX512-NEXT: vpextrb $4, %xmm1, %edx -; AVX512-NEXT: vpextrb $4, %xmm0, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: subb %dl, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: subb %dl, %al -; AVX512-NEXT: jno .LBB1_59 -; AVX512-NEXT: # %bb.58: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: .LBB1_59: -; AVX512-NEXT: vpextrb $3, %xmm1, %ebx -; AVX512-NEXT: vpextrb $3, %xmm0, %ecx -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: subb %bl, %dl -; AVX512-NEXT: setns %dl -; AVX512-NEXT: subb %bl, %cl -; AVX512-NEXT: jno .LBB1_61 -; AVX512-NEXT: # %bb.60: -; AVX512-NEXT: addb $127, %dl -; AVX512-NEXT: movl %edx, %ecx -; AVX512-NEXT: .LBB1_61: -; AVX512-NEXT: vpextrb $2, %xmm1, %esi -; AVX512-NEXT: vpextrb $2, %xmm0, %edx -; AVX512-NEXT: movl %edx, %ebx -; AVX512-NEXT: subb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: subb %sil, %dl -; AVX512-NEXT: jno .LBB1_63 -; AVX512-NEXT: # %bb.62: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %edx -; AVX512-NEXT: .LBB1_63: -; AVX512-NEXT: vpextrb $0, %xmm1, %esi -; AVX512-NEXT: vpextrb 
$0, %xmm0, %r8d -; AVX512-NEXT: movl %r8d, %ebx -; AVX512-NEXT: subb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: subb %sil, %r8b -; AVX512-NEXT: jo .LBB1_64 -; AVX512-NEXT: # %bb.65: -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX512-NEXT: jmp .LBB1_66 -; AVX512-NEXT: .LBB1_64: -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r8d -; AVX512-NEXT: .LBB1_66: -; AVX512-NEXT: vpextrb $1, %xmm1, %esi -; AVX512-NEXT: vpextrb $1, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %ebx -; AVX512-NEXT: subb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: subb %sil, %r9b -; AVX512-NEXT: jno .LBB1_68 -; AVX512-NEXT: # %bb.67: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r9d -; AVX512-NEXT: .LBB1_68: -; AVX512-NEXT: movzbl %r8b, %esi -; AVX512-NEXT: vmovd %esi, %xmm0 -; AVX512-NEXT: movzbl %r9b, %esi -; AVX512-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %dl, %edx -; AVX512-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %cl, %ecx -; AVX512-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %al, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %dil, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %bpl, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r10b, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r11b, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r13b, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r12b, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r15b, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl %r14b, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %z = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %x, <32 x i8> %y) ret <32 x i8> %z } define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { -; SSE2-LABEL: v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: subq $232, %rsp -; SSE2-NEXT: movaps %xmm5, (%rsp) -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb (%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB2_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_2: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: jno .LBB2_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_4: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dil -; SSE2-NEXT: jno .LBB2_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB2_6: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r11b -; SSE2-NEXT: jno .LBB2_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB2_8: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r9b -; SSE2-NEXT: jno .LBB2_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_10: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r10b -; SSE2-NEXT: jno .LBB2_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB2_12: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, 
%bl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_14: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jo .LBB2_15 -; SSE2-NEXT: # %bb.16: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jmp .LBB2_17 -; SSE2-NEXT: .LBB2_15: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_17: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB2_19 -; SSE2-NEXT: # %bb.18: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_19: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: jno .LBB2_21 -; SSE2-NEXT: # %bb.20: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB2_21: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_23 -; SSE2-NEXT: # %bb.22: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_23: -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r9b -; SSE2-NEXT: jno .LBB2_25 -; SSE2-NEXT: # %bb.24: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_25: -; SSE2-NEXT: movl %edi, %r8d -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: jo .LBB2_26 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movl %esi, %edi -; SSE2-NEXT: jmp .LBB2_28 -; SSE2-NEXT: .LBB2_26: -; SSE2-NEXT: movl %esi, %edi -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_28: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_30: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB2_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_32: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl 
-; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_34 -; SSE2-NEXT: # %bb.33: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_34: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB2_36 -; SSE2-NEXT: # %bb.35: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_36: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r9b -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_38 -; SSE2-NEXT: # %bb.37: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_38: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_40 -; SSE2-NEXT: # %bb.39: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_40: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bpl -; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_42 -; SSE2-NEXT: # %bb.41: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB2_42: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_44 -; SSE2-NEXT: # %bb.43: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_44: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB2_46 -; SSE2-NEXT: # %bb.45: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_46: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dil -; SSE2-NEXT: jno .LBB2_48 -; SSE2-NEXT: # %bb.47: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB2_48: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: 
movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_50 -; SSE2-NEXT: # %bb.49: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_50: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_52 -; SSE2-NEXT: # %bb.51: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_52: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dil -; SSE2-NEXT: jno .LBB2_54 -; SSE2-NEXT: # %bb.53: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB2_54: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r14b -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_56 -; SSE2-NEXT: # %bb.55: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB2_56: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r9b -; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_58 -; SSE2-NEXT: # %bb.57: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_58: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_60 -; SSE2-NEXT: # %bb.59: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_60: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB2_62 -; SSE2-NEXT: # %bb.61: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_62: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r15b -; SSE2-NEXT: jno .LBB2_64 -; SSE2-NEXT: # %bb.63: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB2_64: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jo .LBB2_65 -; SSE2-NEXT: # %bb.66: -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: 
jmp .LBB2_67 -; SSE2-NEXT: .LBB2_65: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_67: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r11b -; SSE2-NEXT: jno .LBB2_69 -; SSE2-NEXT: # %bb.68: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB2_69: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_71 -; SSE2-NEXT: # %bb.70: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_71: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: jno .LBB2_73 -; SSE2-NEXT: # %bb.72: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB2_73: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r14b -; SSE2-NEXT: jno .LBB2_75 -; SSE2-NEXT: # %bb.74: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB2_75: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r15b -; SSE2-NEXT: jno .LBB2_77 -; SSE2-NEXT: # %bb.76: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB2_77: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bpl -; SSE2-NEXT: jno .LBB2_79 -; SSE2-NEXT: # %bb.78: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB2_79: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r13b -; SSE2-NEXT: jno .LBB2_81 -; SSE2-NEXT: # %bb.80: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB2_81: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_83 -; SSE2-NEXT: # %bb.82: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_83: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jo .LBB2_84 -; SSE2-NEXT: # %bb.85: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jmp .LBB2_86 -; SSE2-NEXT: .LBB2_84: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_86: -; SSE2-NEXT: 
movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_88 -; SSE2-NEXT: # %bb.87: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_88: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jo .LBB2_89 -; SSE2-NEXT: # %bb.90: -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jmp .LBB2_91 -; SSE2-NEXT: .LBB2_89: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_91: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_93 -; SSE2-NEXT: # %bb.92: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_93: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r8b -; SSE2-NEXT: jno .LBB2_95 -; SSE2-NEXT: # %bb.94: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB2_95: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_97 -; SSE2-NEXT: # %bb.96: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_97: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r9b -; SSE2-NEXT: jno .LBB2_99 -; SSE2-NEXT: # %bb.98: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_99: -; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_101 -; SSE2-NEXT: # %bb.100: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_101: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: jno .LBB2_103 -; SSE2-NEXT: # %bb.102: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB2_103: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r10b -; SSE2-NEXT: jno .LBB2_105 -; 
SSE2-NEXT: # %bb.104: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB2_105: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB2_107 -; SSE2-NEXT: # %bb.106: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB2_107: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_109 -; SSE2-NEXT: # %bb.108: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: .LBB2_109: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r14b -; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_111 -; SSE2-NEXT: # %bb.110: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB2_111: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r15b -; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_113 -; SSE2-NEXT: # %bb.112: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB2_113: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: movl %r8d, %edx -; SSE2-NEXT: jno .LBB2_115 -; SSE2-NEXT: # %bb.114: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB2_115: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r13b -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_117 -; SSE2-NEXT: # %bb.116: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB2_117: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: jno .LBB2_119 -; SSE2-NEXT: # %bb.118: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB2_119: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dil -; SSE2-NEXT: jno .LBB2_121 -; SSE2-NEXT: # %bb.120: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: 
.LBB2_121: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r8b -; SSE2-NEXT: jno .LBB2_123 -; SSE2-NEXT: # %bb.122: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB2_123: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r10b -; SSE2-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Reload -; SSE2-NEXT: jno .LBB2_125 -; SSE2-NEXT: # %bb.124: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB2_125: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r11b -; SSE2-NEXT: jno .LBB2_127 -; SSE2-NEXT: # %bb.126: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB2_127: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: subb %bl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %bl, %cl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB2_129 -; SSE2-NEXT: # %bb.128: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: .LBB2_129: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %dl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %dl, %bl -; SSE2-NEXT: jno .LBB2_131 -; SSE2-NEXT: # %bb.130: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB2_131: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %dl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %dl, %r9b -; SSE2-NEXT: jno .LBB2_133 -; SSE2-NEXT: # %bb.132: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB2_133: -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %dl # 1-byte Reload -; SSE2-NEXT: movzbl %r9b, %ebp -; SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %bl, %ebp -; SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %cl, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r11b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r10b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r8b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %dil, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %sil, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r13b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r12b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r15b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %r14b, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; 
SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %dl, %ecx -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE2-NEXT: 
movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload -; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3],xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload -; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE2-NEXT: movd %r13d, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; SSE2-NEXT: movd %r12d, %xmm1 -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE2-NEXT: movd %r14d, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: movd %ebp, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE2-NEXT: movd %ebx, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; SSE2-NEXT: movd %r11d, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE2-NEXT: movd %eax, %xmm11 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSE2-NEXT: movd %ecx, %xmm6 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: movd %edx, %xmm13 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE2-NEXT: movd %edi, %xmm5 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE2-NEXT: movd %esi, %xmm15 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE2-NEXT: movd %r10d, %xmm10 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded 
Reload -; SSE2-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE2-NEXT: movd %r15d, %xmm2 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE2-NEXT: movd %r12d, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE2-NEXT: movd %r9d, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE2-NEXT: movd %r11d, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE2-NEXT: movd %r14d, %xmm12 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; SSE2-NEXT: movd %edi, %xmm7 -; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; SSE2-NEXT: movd %r13d, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSE2-NEXT: movd %esi, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE2-NEXT: movd %edx, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: movd %r8d, %xmm15 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE2-NEXT: movd %ebp, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movd %ecx, %xmm13 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: movd %r10d, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: movd %ebx, %xmm10 -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE2-NEXT: movd %r15d, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE2-NEXT: movd %r12d, %xmm3 -; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: # xmm11 = 
xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: addq $232, %rsp -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v64i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: subq $232, %rsp -; SSSE3-NEXT: movaps %xmm5, (%rsp) -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb (%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB2_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_2: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: jno .LBB2_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_4: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dil -; SSSE3-NEXT: jno .LBB2_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_6: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r11b -; SSSE3-NEXT: jno .LBB2_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB2_8: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; 
SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: jno .LBB2_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_10: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r10b -; SSSE3-NEXT: jno .LBB2_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB2_12: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_14: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jo .LBB2_15 -; SSSE3-NEXT: # %bb.16: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB2_17 -; SSSE3-NEXT: .LBB2_15: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_17: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB2_19 -; SSSE3-NEXT: # %bb.18: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_19: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b -; SSSE3-NEXT: jno .LBB2_21 -; SSSE3-NEXT: # %bb.20: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_21: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_23 -; SSSE3-NEXT: # %bb.22: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_23: -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: jno .LBB2_25 -; SSSE3-NEXT: # %bb.24: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_25: -; SSSE3-NEXT: movl %edi, %r8d -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jo .LBB2_26 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movl %esi, %edi -; SSSE3-NEXT: jmp .LBB2_28 -; SSSE3-NEXT: .LBB2_26: -; SSSE3-NEXT: movl %esi, %edi -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 
1-byte Spill -; SSSE3-NEXT: .LBB2_28: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_30: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB2_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_32: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_34 -; SSSE3-NEXT: # %bb.33: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_34: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB2_36 -; SSSE3-NEXT: # %bb.35: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_36: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_38 -; SSSE3-NEXT: # %bb.37: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_38: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_40 -; SSSE3-NEXT: # %bb.39: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_40: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bpl -; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_42 -; SSSE3-NEXT: # %bb.41: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB2_42: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_44 -; SSSE3-NEXT: # %bb.43: -; 
SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_44: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB2_46 -; SSSE3-NEXT: # %bb.45: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_46: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dil -; SSSE3-NEXT: jno .LBB2_48 -; SSSE3-NEXT: # %bb.47: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_48: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_50 -; SSSE3-NEXT: # %bb.49: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_50: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_52 -; SSSE3-NEXT: # %bb.51: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_52: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dil -; SSSE3-NEXT: jno .LBB2_54 -; SSSE3-NEXT: # %bb.53: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_54: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r14b -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_56 -; SSSE3-NEXT: # %bb.55: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB2_56: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_58 -; SSSE3-NEXT: # %bb.57: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_58: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_60 -; SSSE3-NEXT: # %bb.59: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_60: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB2_62 
-; SSSE3-NEXT: # %bb.61: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_62: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r15b -; SSSE3-NEXT: jno .LBB2_64 -; SSSE3-NEXT: # %bb.63: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB2_64: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jo .LBB2_65 -; SSSE3-NEXT: # %bb.66: -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB2_67 -; SSSE3-NEXT: .LBB2_65: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_67: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r11b -; SSSE3-NEXT: jno .LBB2_69 -; SSSE3-NEXT: # %bb.68: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB2_69: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_71 -; SSSE3-NEXT: # %bb.70: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_71: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b -; SSSE3-NEXT: jno .LBB2_73 -; SSSE3-NEXT: # %bb.72: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_73: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r14b -; SSSE3-NEXT: jno .LBB2_75 -; SSSE3-NEXT: # %bb.74: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB2_75: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r15b -; SSSE3-NEXT: jno .LBB2_77 -; SSSE3-NEXT: # %bb.76: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB2_77: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bpl -; SSSE3-NEXT: jno .LBB2_79 -; SSSE3-NEXT: # %bb.78: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB2_79: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; 
SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r13b -; SSSE3-NEXT: jno .LBB2_81 -; SSSE3-NEXT: # %bb.80: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB2_81: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_83 -; SSSE3-NEXT: # %bb.82: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_83: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jo .LBB2_84 -; SSSE3-NEXT: # %bb.85: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB2_86 -; SSSE3-NEXT: .LBB2_84: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_86: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_88 -; SSSE3-NEXT: # %bb.87: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_88: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jo .LBB2_89 -; SSSE3-NEXT: # %bb.90: -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jmp .LBB2_91 -; SSSE3-NEXT: .LBB2_89: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_91: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_93 -; SSSE3-NEXT: # %bb.92: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_93: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r8b -; SSSE3-NEXT: jno .LBB2_95 -; SSSE3-NEXT: # %bb.94: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB2_95: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_97 -; SSSE3-NEXT: # %bb.96: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_97: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: jno .LBB2_99 -; 
SSSE3-NEXT: # %bb.98: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_99: -; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_101 -; SSSE3-NEXT: # %bb.100: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_101: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b -; SSSE3-NEXT: jno .LBB2_103 -; SSSE3-NEXT: # %bb.102: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_103: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r10b -; SSSE3-NEXT: jno .LBB2_105 -; SSSE3-NEXT: # %bb.104: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB2_105: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB2_107 -; SSSE3-NEXT: # %bb.106: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB2_107: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_109 -; SSSE3-NEXT: # %bb.108: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: .LBB2_109: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r14b -; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_111 -; SSSE3-NEXT: # %bb.110: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB2_111: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r15b -; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_113 -; SSSE3-NEXT: # %bb.112: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB2_113: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b 
-; SSSE3-NEXT: movl %r8d, %edx -; SSSE3-NEXT: jno .LBB2_115 -; SSSE3-NEXT: # %bb.114: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB2_115: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r13b -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_117 -; SSSE3-NEXT: # %bb.116: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB2_117: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: jno .LBB2_119 -; SSSE3-NEXT: # %bb.118: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB2_119: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dil -; SSSE3-NEXT: jno .LBB2_121 -; SSSE3-NEXT: # %bb.120: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB2_121: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r8b -; SSSE3-NEXT: jno .LBB2_123 -; SSSE3-NEXT: # %bb.122: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB2_123: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r10b -; SSSE3-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Reload -; SSSE3-NEXT: jno .LBB2_125 -; SSSE3-NEXT: # %bb.124: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB2_125: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r11b -; SSSE3-NEXT: jno .LBB2_127 -; SSSE3-NEXT: # %bb.126: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB2_127: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: subb %bl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %bl, %cl -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB2_129 -; SSSE3-NEXT: # %bb.128: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: .LBB2_129: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %dl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %dl, %bl -; SSSE3-NEXT: jno .LBB2_131 -; SSSE3-NEXT: # %bb.130: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB2_131: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %dl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %dl, %r9b -; SSSE3-NEXT: jno 
.LBB2_133 -; SSSE3-NEXT: # %bb.132: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB2_133: -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %dl # 1-byte Reload -; SSSE3-NEXT: movzbl %r9b, %ebp -; SSSE3-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %bl, %ebp -; SSSE3-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %cl, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r11b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r10b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r8b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %dil, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %sil, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r13b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r12b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r15b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %r14b, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %dl, %ecx -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 
1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm5 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = 
xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3],xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm5 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSSE3-NEXT: movd %r13d, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; SSSE3-NEXT: movd %r12d, %xmm1 -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSSE3-NEXT: movd %r15d, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSSE3-NEXT: movd %r14d, %xmm14 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSSE3-NEXT: movd %ebp, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSSE3-NEXT: movd %ebx, %xmm12 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; SSSE3-NEXT: movd %r11d, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSSE3-NEXT: movd %eax, %xmm11 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSSE3-NEXT: movd %ecx, %xmm6 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; 
SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSSE3-NEXT: movd %edx, %xmm13 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSSE3-NEXT: movd %edi, %xmm5 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSSE3-NEXT: movd %esi, %xmm15 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSSE3-NEXT: movd %r10d, %xmm10 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSSE3-NEXT: movd %r15d, %xmm2 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSSE3-NEXT: movd %r12d, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSSE3-NEXT: movd %r9d, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSSE3-NEXT: movd %r11d, %xmm8 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSSE3-NEXT: movd %r14d, %xmm12 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; SSSE3-NEXT: movd %edi, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; SSSE3-NEXT: movd %r13d, %xmm11 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSSE3-NEXT: movd %esi, %xmm14 -; 
SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSSE3-NEXT: movd %edx, %xmm6 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSSE3-NEXT: movd %r8d, %xmm15 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSSE3-NEXT: movd %ebp, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movd %ecx, %xmm13 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSSE3-NEXT: movd %r10d, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSSE3-NEXT: movd %ebx, %xmm10 -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSSE3-NEXT: movd %r15d, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; SSSE3-NEXT: movd %eax, %xmm8 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSSE3-NEXT: movd %r12d, %xmm3 -; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSSE3-NEXT: movdqa %xmm9, %xmm0 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: addq $232, %rsp -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $15, %xmm5, %ecx -; SSE41-NEXT: pextrb $15, %xmm1, %r11d -; SSE41-NEXT: movl 
%r11d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r11b -; SSE41-NEXT: jno .LBB2_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB2_2: -; SSE41-NEXT: pextrb $14, %xmm5, %ecx -; SSE41-NEXT: pextrb $14, %xmm1, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: jno .LBB2_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB2_4: -; SSE41-NEXT: pextrb $13, %xmm5, %ecx -; SSE41-NEXT: pextrb $13, %xmm1, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB2_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_6: -; SSE41-NEXT: pextrb $12, %xmm5, %ecx -; SSE41-NEXT: pextrb $12, %xmm1, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB2_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB2_8: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: subq $76, %rsp -; SSE41-NEXT: pextrb $11, %xmm5, %ecx -; SSE41-NEXT: pextrb $11, %xmm1, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB2_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB2_10: -; SSE41-NEXT: pextrb $10, %xmm5, %ecx -; SSE41-NEXT: pextrb $10, %xmm1, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bl -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB2_12: -; SSE41-NEXT: pextrb $9, %xmm5, %ecx -; SSE41-NEXT: pextrb $9, %xmm1, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jo .LBB2_13 -; SSE41-NEXT: # %bb.14: -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jmp .LBB2_15 -; SSE41-NEXT: .LBB2_13: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB2_15: -; SSE41-NEXT: pextrb $8, %xmm5, %ecx -; SSE41-NEXT: pextrb $8, %xmm1, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB2_17 -; SSE41-NEXT: # %bb.16: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_17: -; SSE41-NEXT: pextrb $7, %xmm5, %ecx -; SSE41-NEXT: pextrb $7, %xmm1, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_19 -; SSE41-NEXT: # %bb.18: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB2_19: -; SSE41-NEXT: pextrb $6, 
%xmm5, %ecx -; SSE41-NEXT: pextrb $6, %xmm1, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_21 -; SSE41-NEXT: # %bb.20: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB2_21: -; SSE41-NEXT: pextrb $5, %xmm5, %ecx -; SSE41-NEXT: pextrb $5, %xmm1, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jo .LBB2_22 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jmp .LBB2_24 -; SSE41-NEXT: .LBB2_22: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB2_24: -; SSE41-NEXT: pextrb $4, %xmm5, %ecx -; SSE41-NEXT: pextrb $4, %xmm1, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r13b -; SSE41-NEXT: jno .LBB2_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB2_26: -; SSE41-NEXT: pextrb $3, %xmm5, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bl -; SSE41-NEXT: jno .LBB2_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB2_28: -; SSE41-NEXT: pextrb $2, %xmm5, %ecx -; SSE41-NEXT: pextrb $2, %xmm1, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB2_30: -; SSE41-NEXT: pextrb $0, %xmm5, %ecx -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB2_32: -; SSE41-NEXT: pextrb $1, %xmm5, %ecx -; SSE41-NEXT: pextrb $1, %xmm1, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_34 -; SSE41-NEXT: # %bb.33: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_34: -; SSE41-NEXT: pextrb $15, %xmm6, %ecx -; SSE41-NEXT: pextrb $15, %xmm2, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r9b -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_36 -; SSE41-NEXT: # %bb.35: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB2_36: -; SSE41-NEXT: pextrb $14, %xmm6, %ecx -; SSE41-NEXT: pextrb $14, %xmm2, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; 
SSE41-NEXT: jno .LBB2_38 -; SSE41-NEXT: # %bb.37: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB2_38: -; SSE41-NEXT: pextrb $13, %xmm6, %ecx -; SSE41-NEXT: pextrb $13, %xmm2, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r14b -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_40 -; SSE41-NEXT: # %bb.39: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB2_40: -; SSE41-NEXT: pextrb $12, %xmm6, %ecx -; SSE41-NEXT: pextrb $12, %xmm2, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r10b -; SSE41-NEXT: jno .LBB2_42 -; SSE41-NEXT: # %bb.41: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB2_42: -; SSE41-NEXT: pextrb $11, %xmm6, %ecx -; SSE41-NEXT: pextrb $11, %xmm2, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bl -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_44 -; SSE41-NEXT: # %bb.43: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB2_44: -; SSE41-NEXT: pextrb $10, %xmm6, %ecx -; SSE41-NEXT: pextrb $10, %xmm2, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB2_46 -; SSE41-NEXT: # %bb.45: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB2_46: -; SSE41-NEXT: pextrb $9, %xmm6, %ecx -; SSE41-NEXT: pextrb $9, %xmm2, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB2_48 -; SSE41-NEXT: # %bb.47: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB2_48: -; SSE41-NEXT: pextrb $8, %xmm6, %ecx -; SSE41-NEXT: pextrb $8, %xmm2, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB2_50 -; SSE41-NEXT: # %bb.49: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_50: -; SSE41-NEXT: pextrb $7, %xmm6, %ecx -; SSE41-NEXT: pextrb $7, %xmm2, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_52 -; SSE41-NEXT: # %bb.51: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB2_52: -; SSE41-NEXT: pextrb $6, %xmm6, %ecx -; SSE41-NEXT: pextrb $6, %xmm2, %r8d -; SSE41-NEXT: movl %r8d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r8b -; SSE41-NEXT: jno .LBB2_54 -; SSE41-NEXT: # %bb.53: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r8d -; SSE41-NEXT: .LBB2_54: -; SSE41-NEXT: pextrb $5, %xmm6, %ecx -; SSE41-NEXT: pextrb $5, %xmm2, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r11b -; SSE41-NEXT: movl %ebx, (%rsp) # 4-byte Spill -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_56 -; SSE41-NEXT: # %bb.55: -; 
SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB2_56: -; SSE41-NEXT: pextrb $4, %xmm6, %ecx -; SSE41-NEXT: pextrb $4, %xmm2, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_58 -; SSE41-NEXT: # %bb.57: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_58: -; SSE41-NEXT: pextrb $3, %xmm6, %ecx -; SSE41-NEXT: pextrb $3, %xmm2, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_60 -; SSE41-NEXT: # %bb.59: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB2_60: -; SSE41-NEXT: pextrb $2, %xmm6, %ecx -; SSE41-NEXT: pextrb $2, %xmm2, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_62 -; SSE41-NEXT: # %bb.61: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_62: -; SSE41-NEXT: pextrb $0, %xmm6, %ecx -; SSE41-NEXT: pextrb $0, %xmm2, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB2_64 -; SSE41-NEXT: # %bb.63: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB2_64: -; SSE41-NEXT: pextrb $1, %xmm6, %ecx -; SSE41-NEXT: pextrb $1, %xmm2, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bl -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_66 -; SSE41-NEXT: # %bb.65: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebx -; SSE41-NEXT: .LBB2_66: -; SSE41-NEXT: pextrb $15, %xmm7, %ecx -; SSE41-NEXT: pextrb $15, %xmm3, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_68 -; SSE41-NEXT: # %bb.67: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB2_68: -; SSE41-NEXT: pextrb $14, %xmm7, %ecx -; SSE41-NEXT: pextrb $14, %xmm3, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_70 -; SSE41-NEXT: # %bb.69: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_70: -; SSE41-NEXT: pextrb $13, %xmm7, %ecx -; SSE41-NEXT: pextrb $13, %xmm3, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_72 -; SSE41-NEXT: # %bb.71: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB2_72: -; SSE41-NEXT: pextrb $12, %xmm7, %ecx -; SSE41-NEXT: pextrb $12, %xmm3, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r15b -; SSE41-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %edx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_74 -; SSE41-NEXT: # %bb.73: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB2_74: -; SSE41-NEXT: pextrb $11, %xmm7, %ecx -; SSE41-NEXT: pextrb $11, %xmm3, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB2_76 -; SSE41-NEXT: # %bb.75: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_76: -; SSE41-NEXT: pextrb $10, %xmm7, %ecx -; SSE41-NEXT: pextrb $10, %xmm3, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_78 -; SSE41-NEXT: # %bb.77: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB2_78: -; SSE41-NEXT: pextrb $9, %xmm7, %ecx -; SSE41-NEXT: pextrb $9, %xmm3, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r11b -; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_80 -; SSE41-NEXT: # %bb.79: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB2_80: -; SSE41-NEXT: pextrb $8, %xmm7, %ecx -; SSE41-NEXT: pextrb $8, %xmm3, %r8d -; SSE41-NEXT: movl %r8d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r8b -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_82 -; SSE41-NEXT: # %bb.81: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r8d -; SSE41-NEXT: .LBB2_82: -; SSE41-NEXT: pextrb $7, %xmm7, %ecx -; SSE41-NEXT: pextrb $7, %xmm3, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r9b -; SSE41-NEXT: jno .LBB2_84 -; SSE41-NEXT: # %bb.83: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB2_84: -; SSE41-NEXT: pextrb $6, %xmm7, %ecx -; SSE41-NEXT: pextrb $6, %xmm3, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r10b -; SSE41-NEXT: jno .LBB2_86 -; SSE41-NEXT: # %bb.85: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB2_86: -; SSE41-NEXT: pextrb $5, %xmm7, %ecx -; SSE41-NEXT: pextrb $5, %xmm3, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r14b -; SSE41-NEXT: jno .LBB2_88 -; SSE41-NEXT: # %bb.87: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB2_88: -; SSE41-NEXT: pextrb $4, %xmm7, %ecx -; SSE41-NEXT: pextrb $4, %xmm3, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r12b -; SSE41-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_90 -; SSE41-NEXT: # %bb.89: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB2_90: -; SSE41-NEXT: pextrb $3, %xmm7, %ecx -; SSE41-NEXT: pextrb $3, %xmm3, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r13b -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno 
.LBB2_92 -; SSE41-NEXT: # %bb.91: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB2_92: -; SSE41-NEXT: pextrb $2, %xmm7, %ecx -; SSE41-NEXT: pextrb $2, %xmm3, %esi -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %sil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_94 -; SSE41-NEXT: # %bb.93: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %esi -; SSE41-NEXT: .LBB2_94: -; SSE41-NEXT: pextrb $0, %xmm7, %ecx -; SSE41-NEXT: pextrb $0, %xmm3, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: movl %edi, %r15d -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB2_96 -; SSE41-NEXT: # %bb.95: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB2_96: -; SSE41-NEXT: pextrb $1, %xmm7, %ecx -; SSE41-NEXT: pextrb $1, %xmm3, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB2_98 -; SSE41-NEXT: # %bb.97: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB2_98: -; SSE41-NEXT: pextrb $15, %xmm4, %ecx -; SSE41-NEXT: pextrb $15, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r12b -; SSE41-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_100 -; SSE41-NEXT: # %bb.99: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB2_100: -; SSE41-NEXT: pextrb $14, %xmm4, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB2_102 -; SSE41-NEXT: # %bb.101: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB2_102: -; SSE41-NEXT: pextrb $13, %xmm4, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %ebx -; SSE41-NEXT: movl %ebx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bl -; SSE41-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jo .LBB2_103 -; SSE41-NEXT: # %bb.104: -; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jmp .LBB2_105 -; SSE41-NEXT: .LBB2_103: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: # kill: def $al killed $al def $eax -; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: .LBB2_105: -; SSE41-NEXT: pextrb $12, %xmm4, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r12b -; SSE41-NEXT: jno .LBB2_107 -; SSE41-NEXT: # %bb.106: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB2_107: -; SSE41-NEXT: pextrb $11, %xmm4, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r13b -; SSE41-NEXT: movl %r11d, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB2_109 -; SSE41-NEXT: # %bb.108: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB2_109: -; SSE41-NEXT: pextrb $10, %xmm4, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r15b -; SSE41-NEXT: jno .LBB2_111 -; SSE41-NEXT: # %bb.110: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB2_111: -; SSE41-NEXT: pextrb $9, %xmm4, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r14b -; SSE41-NEXT: jno .LBB2_113 -; SSE41-NEXT: # %bb.112: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB2_113: -; SSE41-NEXT: pextrb $8, %xmm4, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB2_115 -; SSE41-NEXT: # %bb.114: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB2_115: -; SSE41-NEXT: pextrb $7, %xmm4, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB2_117 -; SSE41-NEXT: # %bb.116: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB2_117: -; SSE41-NEXT: pextrb $6, %xmm4, %edx -; SSE41-NEXT: pextrb $6, %xmm0, %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: setns %cl -; SSE41-NEXT: subb %dl, %al -; SSE41-NEXT: jno .LBB2_119 -; SSE41-NEXT: # %bb.118: -; SSE41-NEXT: addb $127, %cl -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB2_119: -; SSE41-NEXT: pextrb $5, %xmm4, %ebx -; SSE41-NEXT: pextrb $5, %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: subb %bl, %dl -; SSE41-NEXT: setns %dl -; SSE41-NEXT: subb %bl, %cl -; SSE41-NEXT: jno .LBB2_121 -; SSE41-NEXT: # %bb.120: -; SSE41-NEXT: addb $127, %dl -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB2_121: -; SSE41-NEXT: pextrb $4, %xmm4, %esi -; SSE41-NEXT: pextrb $4, %xmm0, %edx -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %dl -; SSE41-NEXT: jno .LBB2_123 -; SSE41-NEXT: # %bb.122: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: .LBB2_123: -; SSE41-NEXT: pextrb $3, %xmm4, %esi -; SSE41-NEXT: pextrb $3, %xmm0, %r8d -; SSE41-NEXT: movl %r8d, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %r8b -; SSE41-NEXT: jno .LBB2_125 -; SSE41-NEXT: # %bb.124: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r8d -; SSE41-NEXT: .LBB2_125: -; SSE41-NEXT: pextrb $2, %xmm4, %esi -; SSE41-NEXT: pextrb $2, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %r9b -; SSE41-NEXT: jno .LBB2_127 -; SSE41-NEXT: # %bb.126: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r9d -; SSE41-NEXT: .LBB2_127: -; SSE41-NEXT: pextrb $0, %xmm4, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %r10d -; SSE41-NEXT: 
movl %r10d, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %r10b -; SSE41-NEXT: jno .LBB2_129 -; SSE41-NEXT: # %bb.128: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r10d -; SSE41-NEXT: .LBB2_129: -; SSE41-NEXT: pextrb $1, %xmm4, %esi -; SSE41-NEXT: pextrb $1, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %r11b -; SSE41-NEXT: jno .LBB2_131 -; SSE41-NEXT: # %bb.130: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r11d -; SSE41-NEXT: .LBB2_131: -; SSE41-NEXT: movzbl %r10b, %esi -; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: movzbl %r11b, %esi -; SSE41-NEXT: pinsrb $1, %esi, %xmm0 -; SSE41-NEXT: movzbl %r9b, %esi -; SSE41-NEXT: pinsrb $2, %esi, %xmm0 -; SSE41-NEXT: movzbl %r8b, %esi -; SSE41-NEXT: pinsrb $3, %esi, %xmm0 -; SSE41-NEXT: movzbl %dl, %edx -; SSE41-NEXT: pinsrb $4, %edx, %xmm0 -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm0 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm0 -; SSE41-NEXT: movzbl %dil, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: movzbl %bpl, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: pinsrb $11, %eax, %xmm0 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $14, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: movd %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $1, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $2, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $3, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $4, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $5, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $6, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $7, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $8, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $9, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $10, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $12, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $14, 
%eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: movd %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $1, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $2, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $3, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $4, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $5, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $6, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $7, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $8, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $9, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $10, %eax, %xmm2 -; SSE41-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $12, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $14, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm2 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: movd %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $1, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $2, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $3, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $4, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $5, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $6, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $7, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $8, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $9, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $10, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $12, %eax, %xmm3 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm3 -; SSE41-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $14, %eax, %xmm3
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $15, %eax, %xmm3
-; SSE41-NEXT: addq $76, %rsp
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v64i8:
+; SSE: # %bb.0:
+; SSE-NEXT: psubsb %xmm4, %xmm0
+; SSE-NEXT: psubsb %xmm5, %xmm1
+; SSE-NEXT: psubsb %xmm6, %xmm2
+; SSE-NEXT: psubsb %xmm7, %xmm3
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: v64i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: subq $76, %rsp
-; AVX1-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX1-NEXT: vpextrb $15, %xmm1, %edx
-; AVX1-NEXT: movl %edx, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %dl
-; AVX1-NEXT: jo .LBB2_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jmp .LBB2_3
-; AVX1-NEXT: .LBB2_1:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: # kill: def $al killed $al def $eax
-; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: .LBB2_3:
-; AVX1-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX1-NEXT: vpextrb $14, %xmm1, %edx
-; AVX1-NEXT: movl %edx, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %dl
-; AVX1-NEXT: jno .LBB2_5
-; AVX1-NEXT: # %bb.4:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: .LBB2_5:
-; AVX1-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX1-NEXT: vpextrb $13, %xmm1, %esi
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %sil
-; AVX1-NEXT: jo .LBB2_6
-; AVX1-NEXT: # %bb.7:
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jmp .LBB2_8
-; AVX1-NEXT: .LBB2_6:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: # kill: def $al killed $al def $eax
-; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: .LBB2_8:
-; AVX1-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX1-NEXT: vpextrb $12, %xmm1, %esi
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %sil
-; AVX1-NEXT: jno .LBB2_10
-; AVX1-NEXT: # %bb.9:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %esi
-; AVX1-NEXT: .LBB2_10:
-; AVX1-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX1-NEXT: vpextrb $11, %xmm1, %edi
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %dil
-; AVX1-NEXT: jno .LBB2_12
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %edi
-; AVX1-NEXT: .LBB2_12:
-; AVX1-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX1-NEXT: vpextrb $10, %xmm1, %ebp
-; AVX1-NEXT: movl %ebp, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %bpl
-; AVX1-NEXT: jno .LBB2_14
-; AVX1-NEXT: # %bb.13:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %ebp
-; AVX1-NEXT: .LBB2_14:
-; AVX1-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX1-NEXT: vpextrb $9, %xmm1, %ebx
-; AVX1-NEXT: movl %ebx, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %bl
-; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jo .LBB2_15
-; AVX1-NEXT: # %bb.16:
-; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jmp .LBB2_17
-; AVX1-NEXT: .LBB2_15:
-; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB2_17: -; AVX1-NEXT: vpextrb $8, %xmm3, %ecx -; AVX1-NEXT: vpextrb $8, %xmm1, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bpl -; AVX1-NEXT: jno .LBB2_19 -; AVX1-NEXT: # %bb.18: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB2_19: -; AVX1-NEXT: vpextrb $7, %xmm3, %ecx -; AVX1-NEXT: vpextrb $7, %xmm1, %ebx -; AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bl -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_21 -; AVX1-NEXT: # %bb.20: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB2_21: -; AVX1-NEXT: vpextrb $6, %xmm3, %ecx -; AVX1-NEXT: vpextrb $6, %xmm1, %esi -; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %sil -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_23 -; AVX1-NEXT: # %bb.22: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %esi -; AVX1-NEXT: .LBB2_23: -; AVX1-NEXT: vpextrb $5, %xmm3, %ecx -; AVX1-NEXT: vpextrb $5, %xmm1, %r11d -; AVX1-NEXT: movl %r11d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r11b -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_25 -; AVX1-NEXT: # %bb.24: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r11d -; AVX1-NEXT: .LBB2_25: -; AVX1-NEXT: vpextrb $4, %xmm3, %ecx -; AVX1-NEXT: vpextrb $4, %xmm1, %r13d -; AVX1-NEXT: movl %r13d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r13b -; AVX1-NEXT: jno .LBB2_27 -; AVX1-NEXT: # %bb.26: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r13d -; AVX1-NEXT: .LBB2_27: -; AVX1-NEXT: vpextrb $3, %xmm3, %ecx -; AVX1-NEXT: vpextrb $3, %xmm1, %r8d -; AVX1-NEXT: movl %r8d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r8b -; AVX1-NEXT: jno .LBB2_29 -; AVX1-NEXT: # %bb.28: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r8d -; AVX1-NEXT: .LBB2_29: -; AVX1-NEXT: vpextrb $2, %xmm3, %ecx -; AVX1-NEXT: vpextrb $2, %xmm1, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bpl -; AVX1-NEXT: jno .LBB2_31 -; AVX1-NEXT: # %bb.30: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB2_31: -; AVX1-NEXT: vpextrb $0, %xmm3, %ecx -; AVX1-NEXT: vpextrb $0, %xmm1, %ebx -; AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bl -; AVX1-NEXT: jno .LBB2_33 -; AVX1-NEXT: # %bb.32: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB2_33: -; AVX1-NEXT: vpextrb $1, %xmm3, %ecx -; AVX1-NEXT: vpextrb $1, %xmm1, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: jno .LBB2_35 -; AVX1-NEXT: # %bb.34: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB2_35: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpextrb $15, %xmm3, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %edx 
-; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dl -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_37 -; AVX1-NEXT: # %bb.36: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB2_37: -; AVX1-NEXT: vpextrb $14, %xmm3, %ecx -; AVX1-NEXT: vpextrb $14, %xmm1, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_39 -; AVX1-NEXT: # %bb.38: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB2_39: -; AVX1-NEXT: vpextrb $13, %xmm3, %ecx -; AVX1-NEXT: vpextrb $13, %xmm1, %r12d -; AVX1-NEXT: movl %r12d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r12b -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_41 -; AVX1-NEXT: # %bb.40: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r12d -; AVX1-NEXT: .LBB2_41: -; AVX1-NEXT: vpextrb $12, %xmm3, %ecx -; AVX1-NEXT: vpextrb $12, %xmm1, %r15d -; AVX1-NEXT: movl %r15d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r15b -; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_43 -; AVX1-NEXT: # %bb.42: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: .LBB2_43: -; AVX1-NEXT: vpextrb $11, %xmm3, %ecx -; AVX1-NEXT: vpextrb $11, %xmm1, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bpl -; AVX1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_45 -; AVX1-NEXT: # %bb.44: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB2_45: -; AVX1-NEXT: vpextrb $10, %xmm3, %ecx -; AVX1-NEXT: vpextrb $10, %xmm1, %ebx -; AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bl -; AVX1-NEXT: jno .LBB2_47 -; AVX1-NEXT: # %bb.46: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB2_47: -; AVX1-NEXT: vpextrb $9, %xmm3, %ecx -; AVX1-NEXT: vpextrb $9, %xmm1, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dl -; AVX1-NEXT: jno .LBB2_49 -; AVX1-NEXT: # %bb.48: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB2_49: -; AVX1-NEXT: vpextrb $8, %xmm3, %ecx -; AVX1-NEXT: vpextrb $8, %xmm1, %esi -; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %sil -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebx, (%rsp) # 4-byte Spill -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jo .LBB2_50 -; AVX1-NEXT: # %bb.51: -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jmp .LBB2_52 -; AVX1-NEXT: .LBB2_50: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB2_52: -; AVX1-NEXT: vpextrb $7, %xmm3, %ecx -; AVX1-NEXT: vpextrb $7, %xmm1, %r11d -; AVX1-NEXT: movl %r11d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r11b -; AVX1-NEXT: jno 
.LBB2_54 -; AVX1-NEXT: # %bb.53: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r11d -; AVX1-NEXT: .LBB2_54: -; AVX1-NEXT: vpextrb $6, %xmm3, %ecx -; AVX1-NEXT: vpextrb $6, %xmm1, %esi -; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %sil -; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_56 -; AVX1-NEXT: # %bb.55: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %esi -; AVX1-NEXT: .LBB2_56: -; AVX1-NEXT: vpextrb $5, %xmm3, %ecx -; AVX1-NEXT: vpextrb $5, %xmm1, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: jno .LBB2_58 -; AVX1-NEXT: # %bb.57: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB2_58: -; AVX1-NEXT: vpextrb $4, %xmm3, %ecx -; AVX1-NEXT: vpextrb $4, %xmm1, %r13d -; AVX1-NEXT: movl %r13d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r13b -; AVX1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_60 -; AVX1-NEXT: # %bb.59: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB2_60: -; AVX1-NEXT: vpextrb $3, %xmm3, %ecx -; AVX1-NEXT: vpextrb $3, %xmm1, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bpl -; AVX1-NEXT: jo .LBB2_61 -; AVX1-NEXT: # %bb.62: -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jmp .LBB2_63 -; AVX1-NEXT: .LBB2_61: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB2_63: -; AVX1-NEXT: vpextrb $2, %xmm3, %ecx -; AVX1-NEXT: vpextrb $2, %xmm1, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bpl -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_65 -; AVX1-NEXT: # %bb.64: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB2_65: -; AVX1-NEXT: vpextrb $0, %xmm3, %ecx -; AVX1-NEXT: vpextrb $0, %xmm1, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_67 -; AVX1-NEXT: # %bb.66: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB2_67: -; AVX1-NEXT: vpextrb $1, %xmm3, %ecx -; AVX1-NEXT: vpextrb $1, %xmm1, %ebx -; AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bl -; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_69 -; AVX1-NEXT: # %bb.68: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB2_69: -; AVX1-NEXT: vpextrb $15, %xmm2, %ecx -; AVX1-NEXT: vpextrb $15, %xmm0, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dl -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_71 -; AVX1-NEXT: # %bb.70: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB2_71: -; AVX1-NEXT: vpextrb $14, %xmm2, %ecx -; AVX1-NEXT: vpextrb $14, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: 
subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: jno .LBB2_73 -; AVX1-NEXT: # %bb.72: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB2_73: -; AVX1-NEXT: vpextrb $13, %xmm2, %ecx -; AVX1-NEXT: vpextrb $13, %xmm0, %r10d -; AVX1-NEXT: movl %r10d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r10b -; AVX1-NEXT: jno .LBB2_75 -; AVX1-NEXT: # %bb.74: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r10d -; AVX1-NEXT: .LBB2_75: -; AVX1-NEXT: vpextrb $12, %xmm2, %ecx -; AVX1-NEXT: vpextrb $12, %xmm0, %r12d -; AVX1-NEXT: movl %r12d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r12b -; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_77 -; AVX1-NEXT: # %bb.76: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB2_77: -; AVX1-NEXT: vpextrb $11, %xmm2, %ecx -; AVX1-NEXT: vpextrb $11, %xmm0, %r14d -; AVX1-NEXT: movl %r14d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r14b -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_79 -; AVX1-NEXT: # %bb.78: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: .LBB2_79: -; AVX1-NEXT: vpextrb $10, %xmm2, %ecx -; AVX1-NEXT: vpextrb $10, %xmm0, %r13d -; AVX1-NEXT: movl %r13d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r13b -; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_81 -; AVX1-NEXT: # %bb.80: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r13d -; AVX1-NEXT: .LBB2_81: -; AVX1-NEXT: vpextrb $9, %xmm2, %ecx -; AVX1-NEXT: vpextrb $9, %xmm0, %r8d -; AVX1-NEXT: movl %r8d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r8b -; AVX1-NEXT: jno .LBB2_83 -; AVX1-NEXT: # %bb.82: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r8d -; AVX1-NEXT: .LBB2_83: -; AVX1-NEXT: vpextrb $8, %xmm2, %ecx -; AVX1-NEXT: vpextrb $8, %xmm0, %r15d -; AVX1-NEXT: movl %r15d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r15b -; AVX1-NEXT: jno .LBB2_85 -; AVX1-NEXT: # %bb.84: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: .LBB2_85: -; AVX1-NEXT: vpextrb $7, %xmm2, %ecx -; AVX1-NEXT: vpextrb $7, %xmm0, %r12d -; AVX1-NEXT: movl %r12d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r12b -; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_87 -; AVX1-NEXT: # %bb.86: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB2_87: -; AVX1-NEXT: vpextrb $6, %xmm2, %ecx -; AVX1-NEXT: vpextrb $6, %xmm0, %r12d -; AVX1-NEXT: movl %r12d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r12b -; AVX1-NEXT: jno .LBB2_89 -; AVX1-NEXT: # %bb.88: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r12d -; AVX1-NEXT: .LBB2_89: -; AVX1-NEXT: vpextrb $5, %xmm2, %ecx -; AVX1-NEXT: vpextrb $5, %xmm0, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: 
subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bpl -; AVX1-NEXT: jno .LBB2_91 -; AVX1-NEXT: # %bb.90: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB2_91: -; AVX1-NEXT: vpextrb $4, %xmm2, %ecx -; AVX1-NEXT: vpextrb $4, %xmm0, %esi -; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %sil -; AVX1-NEXT: jno .LBB2_93 -; AVX1-NEXT: # %bb.92: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %esi -; AVX1-NEXT: .LBB2_93: -; AVX1-NEXT: vpextrb $3, %xmm2, %ecx -; AVX1-NEXT: vpextrb $3, %xmm0, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dl -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_95 -; AVX1-NEXT: # %bb.94: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB2_95: -; AVX1-NEXT: vpextrb $2, %xmm2, %ecx -; AVX1-NEXT: vpextrb $2, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: jno .LBB2_97 -; AVX1-NEXT: # %bb.96: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB2_97: -; AVX1-NEXT: vpextrb $0, %xmm2, %ecx -; AVX1-NEXT: vpextrb $0, %xmm0, %ebx -; AVX1-NEXT: movl %ebx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bl -; AVX1-NEXT: jno .LBB2_99 -; AVX1-NEXT: # %bb.98: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: .LBB2_99: -; AVX1-NEXT: vpextrb $1, %xmm2, %ecx -; AVX1-NEXT: vpextrb $1, %xmm0, %r11d -; AVX1-NEXT: movl %r11d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r11b -; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_101 -; AVX1-NEXT: # %bb.100: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB2_101: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpextrb $15, %xmm0, %r11d -; AVX1-NEXT: movl %r11d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r11b -; AVX1-NEXT: jno .LBB2_103 -; AVX1-NEXT: # %bb.102: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r11d -; AVX1-NEXT: .LBB2_103: -; AVX1-NEXT: vpextrb $14, %xmm1, %ecx -; AVX1-NEXT: vpextrb $14, %xmm0, %r9d -; AVX1-NEXT: movl %r9d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r9b -; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_105 -; AVX1-NEXT: # %bb.104: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: # kill: def $al killed $al def $eax -; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: .LBB2_105: -; AVX1-NEXT: vpextrb $13, %xmm1, %ecx -; AVX1-NEXT: vpextrb $13, %xmm0, %r9d -; AVX1-NEXT: movl %r9d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r9b -; AVX1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_107 -; AVX1-NEXT: # %bb.106: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r9d -; AVX1-NEXT: .LBB2_107: -; AVX1-NEXT: vpextrb $12, %xmm1, %ecx -; AVX1-NEXT: vpextrb $12, %xmm0, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dl -; AVX1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_109 -; AVX1-NEXT: # %bb.108: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: .LBB2_109: -; AVX1-NEXT: vpextrb $11, %xmm1, %ecx -; AVX1-NEXT: vpextrb $11, %xmm0, %r13d -; AVX1-NEXT: movl %r13d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r13b -; AVX1-NEXT: jno .LBB2_111 -; AVX1-NEXT: # %bb.110: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r13d -; AVX1-NEXT: .LBB2_111: -; AVX1-NEXT: vpextrb $10, %xmm1, %ecx -; AVX1-NEXT: vpextrb $10, %xmm0, %r15d -; AVX1-NEXT: movl %r15d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r15b -; AVX1-NEXT: jno .LBB2_113 -; AVX1-NEXT: # %bb.112: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: .LBB2_113: -; AVX1-NEXT: vpextrb $9, %xmm1, %ecx -; AVX1-NEXT: vpextrb $9, %xmm0, %r14d -; AVX1-NEXT: movl %r14d, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %r14b -; AVX1-NEXT: jno .LBB2_115 -; AVX1-NEXT: # %bb.114: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: .LBB2_115: -; AVX1-NEXT: vpextrb $8, %xmm1, %ecx -; AVX1-NEXT: vpextrb $8, %xmm0, %ebp -; AVX1-NEXT: movl %ebp, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %bpl -; AVX1-NEXT: jno .LBB2_117 -; AVX1-NEXT: # %bb.116: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: .LBB2_117: -; AVX1-NEXT: vpextrb $7, %xmm1, %ecx -; AVX1-NEXT: vpextrb $7, %xmm0, %edi -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb %cl, %dil -; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_119 -; AVX1-NEXT: # %bb.118: -; AVX1-NEXT: addb $127, %al -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: .LBB2_119: -; AVX1-NEXT: vpextrb $6, %xmm1, %edx -; AVX1-NEXT: vpextrb $6, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: subb %dl, %cl -; AVX1-NEXT: setns %cl -; AVX1-NEXT: subb %dl, %al -; AVX1-NEXT: jno .LBB2_121 -; AVX1-NEXT: # %bb.120: -; AVX1-NEXT: addb $127, %cl -; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: .LBB2_121: -; AVX1-NEXT: vpextrb $5, %xmm1, %ebx -; AVX1-NEXT: vpextrb $5, %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: subb %bl, %dl -; AVX1-NEXT: setns %dl -; AVX1-NEXT: subb %bl, %cl -; AVX1-NEXT: jno .LBB2_123 -; AVX1-NEXT: # %bb.122: -; AVX1-NEXT: addb $127, %dl -; AVX1-NEXT: movl %edx, %ecx -; AVX1-NEXT: .LBB2_123: -; AVX1-NEXT: vpextrb $4, %xmm1, %esi -; AVX1-NEXT: vpextrb $4, %xmm0, %edx -; AVX1-NEXT: movl %edx, %ebx -; AVX1-NEXT: subb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: subb %sil, %dl -; AVX1-NEXT: jno .LBB2_125 -; AVX1-NEXT: # %bb.124: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %edx -; AVX1-NEXT: .LBB2_125: -; AVX1-NEXT: vpextrb $3, %xmm1, %esi -; AVX1-NEXT: vpextrb $3, %xmm0, %r8d -; AVX1-NEXT: movl %r8d, %ebx -; AVX1-NEXT: subb %sil, %bl -; 
AVX1-NEXT: setns %bl -; AVX1-NEXT: subb %sil, %r8b -; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX1-NEXT: jno .LBB2_127 -; AVX1-NEXT: # %bb.126: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r8d -; AVX1-NEXT: .LBB2_127: -; AVX1-NEXT: vpextrb $2, %xmm1, %esi -; AVX1-NEXT: vpextrb $2, %xmm0, %r9d -; AVX1-NEXT: movl %r9d, %ebx -; AVX1-NEXT: subb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: subb %sil, %r9b -; AVX1-NEXT: jno .LBB2_129 -; AVX1-NEXT: # %bb.128: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r9d -; AVX1-NEXT: .LBB2_129: -; AVX1-NEXT: vpextrb $0, %xmm1, %esi -; AVX1-NEXT: vpextrb $0, %xmm0, %r10d -; AVX1-NEXT: movl %r10d, %ebx -; AVX1-NEXT: subb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: movl %r11d, %r12d -; AVX1-NEXT: subb %sil, %r10b -; AVX1-NEXT: jno .LBB2_131 -; AVX1-NEXT: # %bb.130: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r10d -; AVX1-NEXT: .LBB2_131: -; AVX1-NEXT: vpextrb $1, %xmm1, %esi -; AVX1-NEXT: vpextrb $1, %xmm0, %r11d -; AVX1-NEXT: movl %r11d, %ebx -; AVX1-NEXT: subb %sil, %bl -; AVX1-NEXT: setns %bl -; AVX1-NEXT: subb %sil, %r11b -; AVX1-NEXT: jno .LBB2_133 -; AVX1-NEXT: # %bb.132: -; AVX1-NEXT: addb $127, %bl -; AVX1-NEXT: movl %ebx, %r11d -; AVX1-NEXT: .LBB2_133: -; AVX1-NEXT: movzbl %r10b, %esi -; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: movzbl %r11b, %esi -; AVX1-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r9b, %esi -; AVX1-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r8b, %esi -; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %dl, %edx -; AVX1-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %al, %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %dil, %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %bpl, %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r14b, %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r15b, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r13b, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl %r12b, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb 
$7, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; 
AVX1-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX1-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1 -; AVX1-NEXT: addq $76, %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsubsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsubsb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpsubsb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsubsb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $76, %rsp -; AVX2-NEXT: vpextrb $15, %xmm3, %ecx -; AVX2-NEXT: vpextrb $15, %xmm1, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: jo .LBB2_1 -; AVX2-NEXT: # %bb.2: -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB2_3 -; AVX2-NEXT: .LBB2_1: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_3: -; AVX2-NEXT: vpextrb $14, %xmm3, %ecx -; AVX2-NEXT: vpextrb $14, %xmm1, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: jno .LBB2_5 -; AVX2-NEXT: # %bb.4: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_5: -; AVX2-NEXT: vpextrb $13, %xmm3, %ecx -; AVX2-NEXT: vpextrb $13, %xmm1, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: jo .LBB2_6 -; AVX2-NEXT: # %bb.7: -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB2_8 -; AVX2-NEXT: .LBB2_6: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: 
.LBB2_8: -; AVX2-NEXT: vpextrb $12, %xmm3, %ecx -; AVX2-NEXT: vpextrb $12, %xmm1, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: jno .LBB2_10 -; AVX2-NEXT: # %bb.9: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB2_10: -; AVX2-NEXT: vpextrb $11, %xmm3, %ecx -; AVX2-NEXT: vpextrb $11, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: jno .LBB2_12 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_12: -; AVX2-NEXT: vpextrb $10, %xmm3, %ecx -; AVX2-NEXT: vpextrb $10, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: jno .LBB2_14 -; AVX2-NEXT: # %bb.13: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_14: -; AVX2-NEXT: vpextrb $9, %xmm3, %ecx -; AVX2-NEXT: vpextrb $9, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bl -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jo .LBB2_15 -; AVX2-NEXT: # %bb.16: -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB2_17 -; AVX2-NEXT: .LBB2_15: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_17: -; AVX2-NEXT: vpextrb $8, %xmm3, %ecx -; AVX2-NEXT: vpextrb $8, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: jno .LBB2_19 -; AVX2-NEXT: # %bb.18: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_19: -; AVX2-NEXT: vpextrb $7, %xmm3, %ecx -; AVX2-NEXT: vpextrb $7, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bl -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_21 -; AVX2-NEXT: # %bb.20: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_21: -; AVX2-NEXT: vpextrb $6, %xmm3, %ecx -; AVX2-NEXT: vpextrb $6, %xmm1, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_23 -; AVX2-NEXT: # %bb.22: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB2_23: -; AVX2-NEXT: vpextrb $5, %xmm3, %ecx -; AVX2-NEXT: vpextrb $5, %xmm1, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r11b -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_25 -; AVX2-NEXT: # %bb.24: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB2_25: -; AVX2-NEXT: vpextrb $4, %xmm3, %ecx -; AVX2-NEXT: vpextrb $4, %xmm1, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r13b -; AVX2-NEXT: jno .LBB2_27 -; AVX2-NEXT: # %bb.26: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: .LBB2_27: -; AVX2-NEXT: vpextrb $3, %xmm3, %ecx -; AVX2-NEXT: 
vpextrb $3, %xmm1, %r8d -; AVX2-NEXT: movl %r8d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r8b -; AVX2-NEXT: jno .LBB2_29 -; AVX2-NEXT: # %bb.28: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r8d -; AVX2-NEXT: .LBB2_29: -; AVX2-NEXT: vpextrb $2, %xmm3, %ecx -; AVX2-NEXT: vpextrb $2, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: jno .LBB2_31 -; AVX2-NEXT: # %bb.30: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_31: -; AVX2-NEXT: vpextrb $0, %xmm3, %ecx -; AVX2-NEXT: vpextrb $0, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bl -; AVX2-NEXT: jno .LBB2_33 -; AVX2-NEXT: # %bb.32: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_33: -; AVX2-NEXT: vpextrb $1, %xmm3, %ecx -; AVX2-NEXT: vpextrb $1, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: jno .LBB2_35 -; AVX2-NEXT: # %bb.34: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_35: -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vpextrb $15, %xmm3, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_37 -; AVX2-NEXT: # %bb.36: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_37: -; AVX2-NEXT: vpextrb $14, %xmm3, %ecx -; AVX2-NEXT: vpextrb $14, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_39 -; AVX2-NEXT: # %bb.38: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_39: -; AVX2-NEXT: vpextrb $13, %xmm3, %ecx -; AVX2-NEXT: vpextrb $13, %xmm1, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r12b -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_41 -; AVX2-NEXT: # %bb.40: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r12d -; AVX2-NEXT: .LBB2_41: -; AVX2-NEXT: vpextrb $12, %xmm3, %ecx -; AVX2-NEXT: vpextrb $12, %xmm1, %r15d -; AVX2-NEXT: movl %r15d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r15b -; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_43 -; AVX2-NEXT: # %bb.42: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: .LBB2_43: -; AVX2-NEXT: vpextrb $11, %xmm3, %ecx -; AVX2-NEXT: vpextrb $11, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_45 -; AVX2-NEXT: # %bb.44: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_45: -; AVX2-NEXT: vpextrb $10, %xmm3, %ecx -; AVX2-NEXT: vpextrb $10, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns 
%al -; AVX2-NEXT: subb %cl, %bl -; AVX2-NEXT: jno .LBB2_47 -; AVX2-NEXT: # %bb.46: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_47: -; AVX2-NEXT: vpextrb $9, %xmm3, %ecx -; AVX2-NEXT: vpextrb $9, %xmm1, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: jno .LBB2_49 -; AVX2-NEXT: # %bb.48: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_49: -; AVX2-NEXT: vpextrb $8, %xmm3, %ecx -; AVX2-NEXT: vpextrb $8, %xmm1, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, (%rsp) # 4-byte Spill -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jo .LBB2_50 -; AVX2-NEXT: # %bb.51: -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB2_52 -; AVX2-NEXT: .LBB2_50: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_52: -; AVX2-NEXT: vpextrb $7, %xmm3, %ecx -; AVX2-NEXT: vpextrb $7, %xmm1, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r11b -; AVX2-NEXT: jno .LBB2_54 -; AVX2-NEXT: # %bb.53: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB2_54: -; AVX2-NEXT: vpextrb $6, %xmm3, %ecx -; AVX2-NEXT: vpextrb $6, %xmm1, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_56 -; AVX2-NEXT: # %bb.55: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB2_56: -; AVX2-NEXT: vpextrb $5, %xmm3, %ecx -; AVX2-NEXT: vpextrb $5, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: jno .LBB2_58 -; AVX2-NEXT: # %bb.57: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_58: -; AVX2-NEXT: vpextrb $4, %xmm3, %ecx -; AVX2-NEXT: vpextrb $4, %xmm1, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r13b -; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_60 -; AVX2-NEXT: # %bb.59: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_60: -; AVX2-NEXT: vpextrb $3, %xmm3, %ecx -; AVX2-NEXT: vpextrb $3, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: jo .LBB2_61 -; AVX2-NEXT: # %bb.62: -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jmp .LBB2_63 -; AVX2-NEXT: .LBB2_61: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_63: -; AVX2-NEXT: vpextrb $2, %xmm3, %ecx -; AVX2-NEXT: vpextrb $2, %xmm1, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno 
.LBB2_65 -; AVX2-NEXT: # %bb.64: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_65: -; AVX2-NEXT: vpextrb $0, %xmm3, %ecx -; AVX2-NEXT: vpextrb $0, %xmm1, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_67 -; AVX2-NEXT: # %bb.66: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_67: -; AVX2-NEXT: vpextrb $1, %xmm3, %ecx -; AVX2-NEXT: vpextrb $1, %xmm1, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bl -; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_69 -; AVX2-NEXT: # %bb.68: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_69: -; AVX2-NEXT: vpextrb $15, %xmm2, %ecx -; AVX2-NEXT: vpextrb $15, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_71 -; AVX2-NEXT: # %bb.70: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_71: -; AVX2-NEXT: vpextrb $14, %xmm2, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: jno .LBB2_73 -; AVX2-NEXT: # %bb.72: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_73: -; AVX2-NEXT: vpextrb $13, %xmm2, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, %r10d -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r10b -; AVX2-NEXT: jno .LBB2_75 -; AVX2-NEXT: # %bb.74: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r10d -; AVX2-NEXT: .LBB2_75: -; AVX2-NEXT: vpextrb $12, %xmm2, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r12b -; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_77 -; AVX2-NEXT: # %bb.76: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_77: -; AVX2-NEXT: vpextrb $11, %xmm2, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, %r14d -; AVX2-NEXT: movl %r14d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r14b -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_79 -; AVX2-NEXT: # %bb.78: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: .LBB2_79: -; AVX2-NEXT: vpextrb $10, %xmm2, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r13b -; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_81 -; AVX2-NEXT: # %bb.80: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: .LBB2_81: -; AVX2-NEXT: vpextrb $9, %xmm2, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, %r8d -; AVX2-NEXT: movl %r8d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: 
subb %cl, %r8b -; AVX2-NEXT: jno .LBB2_83 -; AVX2-NEXT: # %bb.82: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r8d -; AVX2-NEXT: .LBB2_83: -; AVX2-NEXT: vpextrb $8, %xmm2, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, %r15d -; AVX2-NEXT: movl %r15d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r15b -; AVX2-NEXT: jno .LBB2_85 -; AVX2-NEXT: # %bb.84: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: .LBB2_85: -; AVX2-NEXT: vpextrb $7, %xmm2, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r12b -; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_87 -; AVX2-NEXT: # %bb.86: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_87: -; AVX2-NEXT: vpextrb $6, %xmm2, %ecx -; AVX2-NEXT: vpextrb $6, %xmm0, %r12d -; AVX2-NEXT: movl %r12d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r12b -; AVX2-NEXT: jno .LBB2_89 -; AVX2-NEXT: # %bb.88: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r12d -; AVX2-NEXT: .LBB2_89: -; AVX2-NEXT: vpextrb $5, %xmm2, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: jno .LBB2_91 -; AVX2-NEXT: # %bb.90: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_91: -; AVX2-NEXT: vpextrb $4, %xmm2, %ecx -; AVX2-NEXT: vpextrb $4, %xmm0, %esi -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %sil -; AVX2-NEXT: jno .LBB2_93 -; AVX2-NEXT: # %bb.92: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %esi -; AVX2-NEXT: .LBB2_93: -; AVX2-NEXT: vpextrb $3, %xmm2, %ecx -; AVX2-NEXT: vpextrb $3, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_95 -; AVX2-NEXT: # %bb.94: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_95: -; AVX2-NEXT: vpextrb $2, %xmm2, %ecx -; AVX2-NEXT: vpextrb $2, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: jno .LBB2_97 -; AVX2-NEXT: # %bb.96: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_97: -; AVX2-NEXT: vpextrb $0, %xmm2, %ecx -; AVX2-NEXT: vpextrb $0, %xmm0, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bl -; AVX2-NEXT: jno .LBB2_99 -; AVX2-NEXT: # %bb.98: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: .LBB2_99: -; AVX2-NEXT: vpextrb $1, %xmm2, %ecx -; AVX2-NEXT: vpextrb $1, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r11b -; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_101 -; AVX2-NEXT: # %bb.100: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_101: -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; 
AVX2-NEXT: vpextrb $15, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r11b -; AVX2-NEXT: jno .LBB2_103 -; AVX2-NEXT: # %bb.102: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: .LBB2_103: -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r9b -; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_105 -; AVX2-NEXT: # %bb.104: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: # kill: def $al killed $al def $eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: .LBB2_105: -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r9b -; AVX2-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_107 -; AVX2-NEXT: # %bb.106: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r9d -; AVX2-NEXT: .LBB2_107: -; AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, %edx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dl -; AVX2-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_109 -; AVX2-NEXT: # %bb.108: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: .LBB2_109: -; AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, %r13d -; AVX2-NEXT: movl %r13d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r13b -; AVX2-NEXT: jno .LBB2_111 -; AVX2-NEXT: # %bb.110: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: .LBB2_111: -; AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, %r15d -; AVX2-NEXT: movl %r15d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r15b -; AVX2-NEXT: jno .LBB2_113 -; AVX2-NEXT: # %bb.112: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: .LBB2_113: -; AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, %r14d -; AVX2-NEXT: movl %r14d, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %r14b -; AVX2-NEXT: jno .LBB2_115 -; AVX2-NEXT: # %bb.114: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: .LBB2_115: -; AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, %ebp -; AVX2-NEXT: movl %ebp, %eax -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %bpl -; AVX2-NEXT: jno .LBB2_117 -; AVX2-NEXT: # %bb.116: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: .LBB2_117: -; AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, %edi -; AVX2-NEXT: movl %edi, %eax 
-; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: setns %al -; AVX2-NEXT: subb %cl, %dil -; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_119 -; AVX2-NEXT: # %bb.118: -; AVX2-NEXT: addb $127, %al -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: .LBB2_119: -; AVX2-NEXT: vpextrb $6, %xmm1, %edx -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: subb %dl, %cl -; AVX2-NEXT: setns %cl -; AVX2-NEXT: subb %dl, %al -; AVX2-NEXT: jno .LBB2_121 -; AVX2-NEXT: # %bb.120: -; AVX2-NEXT: addb $127, %cl -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: .LBB2_121: -; AVX2-NEXT: vpextrb $5, %xmm1, %ebx -; AVX2-NEXT: vpextrb $5, %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: subb %bl, %dl -; AVX2-NEXT: setns %dl -; AVX2-NEXT: subb %bl, %cl -; AVX2-NEXT: jno .LBB2_123 -; AVX2-NEXT: # %bb.122: -; AVX2-NEXT: addb $127, %dl -; AVX2-NEXT: movl %edx, %ecx -; AVX2-NEXT: .LBB2_123: -; AVX2-NEXT: vpextrb $4, %xmm1, %esi -; AVX2-NEXT: vpextrb $4, %xmm0, %edx -; AVX2-NEXT: movl %edx, %ebx -; AVX2-NEXT: subb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: subb %sil, %dl -; AVX2-NEXT: jno .LBB2_125 -; AVX2-NEXT: # %bb.124: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %edx -; AVX2-NEXT: .LBB2_125: -; AVX2-NEXT: vpextrb $3, %xmm1, %esi -; AVX2-NEXT: vpextrb $3, %xmm0, %r8d -; AVX2-NEXT: movl %r8d, %ebx -; AVX2-NEXT: subb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: subb %sil, %r8b -; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: jno .LBB2_127 -; AVX2-NEXT: # %bb.126: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r8d -; AVX2-NEXT: .LBB2_127: -; AVX2-NEXT: vpextrb $2, %xmm1, %esi -; AVX2-NEXT: vpextrb $2, %xmm0, %r9d -; AVX2-NEXT: movl %r9d, %ebx -; AVX2-NEXT: subb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: subb %sil, %r9b -; AVX2-NEXT: jno .LBB2_129 -; AVX2-NEXT: # %bb.128: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r9d -; AVX2-NEXT: .LBB2_129: -; AVX2-NEXT: vpextrb $0, %xmm1, %esi -; AVX2-NEXT: vpextrb $0, %xmm0, %r10d -; AVX2-NEXT: movl %r10d, %ebx -; AVX2-NEXT: subb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: movl %r11d, %r12d -; AVX2-NEXT: subb %sil, %r10b -; AVX2-NEXT: jno .LBB2_131 -; AVX2-NEXT: # %bb.130: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r10d -; AVX2-NEXT: .LBB2_131: -; AVX2-NEXT: vpextrb $1, %xmm1, %esi -; AVX2-NEXT: vpextrb $1, %xmm0, %r11d -; AVX2-NEXT: movl %r11d, %ebx -; AVX2-NEXT: subb %sil, %bl -; AVX2-NEXT: setns %bl -; AVX2-NEXT: subb %sil, %r11b -; AVX2-NEXT: jno .LBB2_133 -; AVX2-NEXT: # %bb.132: -; AVX2-NEXT: addb $127, %bl -; AVX2-NEXT: movl %ebx, %r11d -; AVX2-NEXT: .LBB2_133: -; AVX2-NEXT: movzbl %r10b, %esi -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: movzbl %r11b, %esi -; AVX2-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r9b, %esi -; AVX2-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r8b, %esi -; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %dil, %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %bpl, %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r14b, %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r15b, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r13b, %eax -; AVX2-NEXT: vpinsrb 
$11, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl %r12b, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $9, %eax, 
%xmm2, %xmm2 -; AVX2-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX2-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 -; AVX2-NEXT: addq $76, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpsubsb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubsb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $76, %rsp -; AVX512-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512-NEXT: vpextrb $15, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: jo .LBB2_1 -; AVX512-NEXT: # %bb.2: -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill -; AVX512-NEXT: jmp .LBB2_3 -; AVX512-NEXT: .LBB2_1: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_3: -; AVX512-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512-NEXT: vpextrb $14, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: jno .LBB2_5 -; AVX512-NEXT: # %bb.4: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_5: -; AVX512-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512-NEXT: vpextrb $13, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: jo .LBB2_6 -; AVX512-NEXT: # %bb.7: -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB2_8 -; AVX512-NEXT: .LBB2_6: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_8: -; AVX512-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512-NEXT: vpextrb $12, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: jno .LBB2_10 -; AVX512-NEXT: # %bb.9: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB2_10: -; AVX512-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512-NEXT: vpextrb $11, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: jno .LBB2_12 -; AVX512-NEXT: # %bb.11: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_12: -; AVX512-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512-NEXT: vpextrb $10, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: jno .LBB2_14 -; AVX512-NEXT: # %bb.13: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_14: -; AVX512-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512-NEXT: vpextrb $9, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bl -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jo .LBB2_15 -; AVX512-NEXT: # %bb.16: -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB2_17 -; AVX512-NEXT: .LBB2_15: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_17: -; AVX512-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512-NEXT: vpextrb $8, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: jno .LBB2_19 -; AVX512-NEXT: # %bb.18: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_19: -; AVX512-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512-NEXT: vpextrb $7, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bl -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_21 -; AVX512-NEXT: # %bb.20: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_21: -; AVX512-NEXT: 
vpextrb $6, %xmm1, %ecx -; AVX512-NEXT: vpextrb $6, %xmm0, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_23 -; AVX512-NEXT: # %bb.22: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB2_23: -; AVX512-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512-NEXT: vpextrb $5, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r11b -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_25 -; AVX512-NEXT: # %bb.24: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB2_25: -; AVX512-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512-NEXT: vpextrb $4, %xmm0, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r13b -; AVX512-NEXT: jno .LBB2_27 -; AVX512-NEXT: # %bb.26: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: .LBB2_27: -; AVX512-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512-NEXT: vpextrb $3, %xmm0, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r8b -; AVX512-NEXT: jno .LBB2_29 -; AVX512-NEXT: # %bb.28: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r8d -; AVX512-NEXT: .LBB2_29: -; AVX512-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512-NEXT: vpextrb $2, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: jno .LBB2_31 -; AVX512-NEXT: # %bb.30: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_31: -; AVX512-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512-NEXT: vpextrb $0, %xmm0, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bl -; AVX512-NEXT: jno .LBB2_33 -; AVX512-NEXT: # %bb.32: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_33: -; AVX512-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512-NEXT: vpextrb $1, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: jno .LBB2_35 -; AVX512-NEXT: # %bb.34: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_35: -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpextrb $15, %xmm2, %ecx -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpextrb $15, %xmm3, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_37 -; AVX512-NEXT: # %bb.36: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_37: -; AVX512-NEXT: vpextrb $14, %xmm2, %ecx -; AVX512-NEXT: vpextrb $14, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill -; AVX512-NEXT: jno .LBB2_39 -; AVX512-NEXT: # %bb.38: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_39: -; AVX512-NEXT: vpextrb $13, %xmm2, %ecx -; AVX512-NEXT: vpextrb $13, %xmm3, %r12d -; AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r12b -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_41 -; AVX512-NEXT: # %bb.40: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r12d -; AVX512-NEXT: .LBB2_41: -; AVX512-NEXT: vpextrb $12, %xmm2, %ecx -; AVX512-NEXT: vpextrb $12, %xmm3, %r15d -; AVX512-NEXT: movl %r15d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r15b -; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_43 -; AVX512-NEXT: # %bb.42: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: .LBB2_43: -; AVX512-NEXT: vpextrb $11, %xmm2, %ecx -; AVX512-NEXT: vpextrb $11, %xmm3, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_45 -; AVX512-NEXT: # %bb.44: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_45: -; AVX512-NEXT: vpextrb $10, %xmm2, %ecx -; AVX512-NEXT: vpextrb $10, %xmm3, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bl -; AVX512-NEXT: jno .LBB2_47 -; AVX512-NEXT: # %bb.46: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_47: -; AVX512-NEXT: vpextrb $9, %xmm2, %ecx -; AVX512-NEXT: vpextrb $9, %xmm3, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: jno .LBB2_49 -; AVX512-NEXT: # %bb.48: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_49: -; AVX512-NEXT: vpextrb $8, %xmm2, %ecx -; AVX512-NEXT: vpextrb $8, %xmm3, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, (%rsp) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jo .LBB2_50 -; AVX512-NEXT: # %bb.51: -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB2_52 -; AVX512-NEXT: .LBB2_50: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_52: -; AVX512-NEXT: vpextrb $7, %xmm2, %ecx -; AVX512-NEXT: vpextrb $7, %xmm3, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r11b -; AVX512-NEXT: jno .LBB2_54 -; AVX512-NEXT: # %bb.53: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB2_54: -; AVX512-NEXT: vpextrb $6, %xmm2, %ecx -; AVX512-NEXT: vpextrb $6, %xmm3, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_56 -; AVX512-NEXT: # 
%bb.55: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB2_56: -; AVX512-NEXT: vpextrb $5, %xmm2, %ecx -; AVX512-NEXT: vpextrb $5, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: jno .LBB2_58 -; AVX512-NEXT: # %bb.57: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_58: -; AVX512-NEXT: vpextrb $4, %xmm2, %ecx -; AVX512-NEXT: vpextrb $4, %xmm3, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r13b -; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_60 -; AVX512-NEXT: # %bb.59: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_60: -; AVX512-NEXT: vpextrb $3, %xmm2, %ecx -; AVX512-NEXT: vpextrb $3, %xmm3, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: jo .LBB2_61 -; AVX512-NEXT: # %bb.62: -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jmp .LBB2_63 -; AVX512-NEXT: .LBB2_61: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_63: -; AVX512-NEXT: vpextrb $2, %xmm2, %ecx -; AVX512-NEXT: vpextrb $2, %xmm3, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_65 -; AVX512-NEXT: # %bb.64: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_65: -; AVX512-NEXT: vpextrb $0, %xmm2, %ecx -; AVX512-NEXT: vpextrb $0, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_67 -; AVX512-NEXT: # %bb.66: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_67: -; AVX512-NEXT: vpextrb $1, %xmm2, %ecx -; AVX512-NEXT: vpextrb $1, %xmm3, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bl -; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_69 -; AVX512-NEXT: # %bb.68: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_69: -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; AVX512-NEXT: vpextrb $15, %xmm2, %ecx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512-NEXT: vpextrb $15, %xmm3, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_71 -; AVX512-NEXT: # %bb.70: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_71: -; AVX512-NEXT: vpextrb $14, %xmm2, %ecx -; AVX512-NEXT: vpextrb $14, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: jno .LBB2_73 -; AVX512-NEXT: # %bb.72: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; 
AVX512-NEXT: .LBB2_73: -; AVX512-NEXT: vpextrb $13, %xmm2, %ecx -; AVX512-NEXT: vpextrb $13, %xmm3, %r10d -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r10b -; AVX512-NEXT: jno .LBB2_75 -; AVX512-NEXT: # %bb.74: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r10d -; AVX512-NEXT: .LBB2_75: -; AVX512-NEXT: vpextrb $12, %xmm2, %ecx -; AVX512-NEXT: vpextrb $12, %xmm3, %r12d -; AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r12b -; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_77 -; AVX512-NEXT: # %bb.76: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_77: -; AVX512-NEXT: vpextrb $11, %xmm2, %ecx -; AVX512-NEXT: vpextrb $11, %xmm3, %r14d -; AVX512-NEXT: movl %r14d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r14b -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_79 -; AVX512-NEXT: # %bb.78: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: .LBB2_79: -; AVX512-NEXT: vpextrb $10, %xmm2, %ecx -; AVX512-NEXT: vpextrb $10, %xmm3, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r13b -; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_81 -; AVX512-NEXT: # %bb.80: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: .LBB2_81: -; AVX512-NEXT: vpextrb $9, %xmm2, %ecx -; AVX512-NEXT: vpextrb $9, %xmm3, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r8b -; AVX512-NEXT: jno .LBB2_83 -; AVX512-NEXT: # %bb.82: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r8d -; AVX512-NEXT: .LBB2_83: -; AVX512-NEXT: vpextrb $8, %xmm2, %ecx -; AVX512-NEXT: vpextrb $8, %xmm3, %r15d -; AVX512-NEXT: movl %r15d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r15b -; AVX512-NEXT: jno .LBB2_85 -; AVX512-NEXT: # %bb.84: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: .LBB2_85: -; AVX512-NEXT: vpextrb $7, %xmm2, %ecx -; AVX512-NEXT: vpextrb $7, %xmm3, %r12d -; AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r12b -; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_87 -; AVX512-NEXT: # %bb.86: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_87: -; AVX512-NEXT: vpextrb $6, %xmm2, %ecx -; AVX512-NEXT: vpextrb $6, %xmm3, %r12d -; AVX512-NEXT: movl %r12d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r12b -; AVX512-NEXT: jno .LBB2_89 -; AVX512-NEXT: # %bb.88: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r12d -; AVX512-NEXT: .LBB2_89: -; AVX512-NEXT: vpextrb $5, %xmm2, %ecx -; AVX512-NEXT: vpextrb $5, %xmm3, %ebp -; AVX512-NEXT: movl %ebp, 
%eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: jno .LBB2_91 -; AVX512-NEXT: # %bb.90: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_91: -; AVX512-NEXT: vpextrb $4, %xmm2, %ecx -; AVX512-NEXT: vpextrb $4, %xmm3, %esi -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %sil -; AVX512-NEXT: jno .LBB2_93 -; AVX512-NEXT: # %bb.92: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %esi -; AVX512-NEXT: .LBB2_93: -; AVX512-NEXT: vpextrb $3, %xmm2, %ecx -; AVX512-NEXT: vpextrb $3, %xmm3, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_95 -; AVX512-NEXT: # %bb.94: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_95: -; AVX512-NEXT: vpextrb $2, %xmm2, %ecx -; AVX512-NEXT: vpextrb $2, %xmm3, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: jno .LBB2_97 -; AVX512-NEXT: # %bb.96: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_97: -; AVX512-NEXT: vpextrb $0, %xmm2, %ecx -; AVX512-NEXT: vpextrb $0, %xmm3, %ebx -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bl -; AVX512-NEXT: jno .LBB2_99 -; AVX512-NEXT: # %bb.98: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: .LBB2_99: -; AVX512-NEXT: vpextrb $1, %xmm2, %ecx -; AVX512-NEXT: vpextrb $1, %xmm3, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r11b -; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_101 -; AVX512-NEXT: # %bb.100: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_101: -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vpextrb $15, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r11b -; AVX512-NEXT: jno .LBB2_103 -; AVX512-NEXT: # %bb.102: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r11d -; AVX512-NEXT: .LBB2_103: -; AVX512-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512-NEXT: vpextrb $14, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r9b -; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_105 -; AVX512-NEXT: # %bb.104: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: .LBB2_105: -; AVX512-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512-NEXT: vpextrb $13, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r9b -; AVX512-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl 
%r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_107 -; AVX512-NEXT: # %bb.106: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r9d -; AVX512-NEXT: .LBB2_107: -; AVX512-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512-NEXT: vpextrb $12, %xmm0, %edx -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dl -; AVX512-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_109 -; AVX512-NEXT: # %bb.108: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: .LBB2_109: -; AVX512-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512-NEXT: vpextrb $11, %xmm0, %r13d -; AVX512-NEXT: movl %r13d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r13b -; AVX512-NEXT: jno .LBB2_111 -; AVX512-NEXT: # %bb.110: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: .LBB2_111: -; AVX512-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512-NEXT: vpextrb $10, %xmm0, %r15d -; AVX512-NEXT: movl %r15d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r15b -; AVX512-NEXT: jno .LBB2_113 -; AVX512-NEXT: # %bb.112: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: .LBB2_113: -; AVX512-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512-NEXT: vpextrb $9, %xmm0, %r14d -; AVX512-NEXT: movl %r14d, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %r14b -; AVX512-NEXT: jno .LBB2_115 -; AVX512-NEXT: # %bb.114: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: .LBB2_115: -; AVX512-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512-NEXT: vpextrb $8, %xmm0, %ebp -; AVX512-NEXT: movl %ebp, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %bpl -; AVX512-NEXT: jno .LBB2_117 -; AVX512-NEXT: # %bb.116: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: .LBB2_117: -; AVX512-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512-NEXT: vpextrb $7, %xmm0, %edi -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: subb %cl, %al -; AVX512-NEXT: setns %al -; AVX512-NEXT: subb %cl, %dil -; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_119 -; AVX512-NEXT: # %bb.118: -; AVX512-NEXT: addb $127, %al -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: .LBB2_119: -; AVX512-NEXT: vpextrb $6, %xmm1, %edx -; AVX512-NEXT: vpextrb $6, %xmm0, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: subb %dl, %cl -; AVX512-NEXT: setns %cl -; AVX512-NEXT: subb %dl, %al -; AVX512-NEXT: jno .LBB2_121 -; AVX512-NEXT: # %bb.120: -; AVX512-NEXT: addb $127, %cl -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: .LBB2_121: -; AVX512-NEXT: vpextrb $5, %xmm1, %ebx -; AVX512-NEXT: vpextrb $5, %xmm0, %ecx -; AVX512-NEXT: movl %ecx, %edx -; AVX512-NEXT: subb %bl, %dl -; AVX512-NEXT: setns %dl -; AVX512-NEXT: subb %bl, %cl -; AVX512-NEXT: jno .LBB2_123 -; AVX512-NEXT: # %bb.122: -; AVX512-NEXT: addb $127, %dl -; AVX512-NEXT: movl %edx, %ecx -; AVX512-NEXT: .LBB2_123: -; AVX512-NEXT: vpextrb $4, %xmm1, %esi 
-; AVX512-NEXT: vpextrb $4, %xmm0, %edx -; AVX512-NEXT: movl %edx, %ebx -; AVX512-NEXT: subb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: subb %sil, %dl -; AVX512-NEXT: jno .LBB2_125 -; AVX512-NEXT: # %bb.124: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %edx -; AVX512-NEXT: .LBB2_125: -; AVX512-NEXT: vpextrb $3, %xmm1, %esi -; AVX512-NEXT: vpextrb $3, %xmm0, %r8d -; AVX512-NEXT: movl %r8d, %ebx -; AVX512-NEXT: subb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: subb %sil, %r8b -; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: jno .LBB2_127 -; AVX512-NEXT: # %bb.126: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r8d -; AVX512-NEXT: .LBB2_127: -; AVX512-NEXT: vpextrb $2, %xmm1, %esi -; AVX512-NEXT: vpextrb $2, %xmm0, %r9d -; AVX512-NEXT: movl %r9d, %ebx -; AVX512-NEXT: subb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: subb %sil, %r9b -; AVX512-NEXT: jno .LBB2_129 -; AVX512-NEXT: # %bb.128: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r9d -; AVX512-NEXT: .LBB2_129: -; AVX512-NEXT: vpextrb $0, %xmm1, %esi -; AVX512-NEXT: vpextrb $0, %xmm0, %r10d -; AVX512-NEXT: movl %r10d, %ebx -; AVX512-NEXT: subb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: movl %r11d, %r12d -; AVX512-NEXT: subb %sil, %r10b -; AVX512-NEXT: jno .LBB2_131 -; AVX512-NEXT: # %bb.130: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r10d -; AVX512-NEXT: .LBB2_131: -; AVX512-NEXT: vpextrb $1, %xmm1, %esi -; AVX512-NEXT: vpextrb $1, %xmm0, %r11d -; AVX512-NEXT: movl %r11d, %ebx -; AVX512-NEXT: subb %sil, %bl -; AVX512-NEXT: setns %bl -; AVX512-NEXT: subb %sil, %r11b -; AVX512-NEXT: jno .LBB2_133 -; AVX512-NEXT: # %bb.132: -; AVX512-NEXT: addb $127, %bl -; AVX512-NEXT: movl %ebx, %r11d -; AVX512-NEXT: .LBB2_133: -; AVX512-NEXT: movzbl %r10b, %esi -; AVX512-NEXT: vmovd %esi, %xmm0 -; AVX512-NEXT: movzbl %r11b, %esi -; AVX512-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r9b, %esi -; AVX512-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r8b, %esi -; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %dl, %edx -; AVX512-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %cl, %ecx -; AVX512-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %al, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %dil, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %bpl, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r14b, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r15b, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r13b, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl %r12b, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; 
AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 
1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vmovd %eax, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: addq $76, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpsubsb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 %z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
 ret <64 x i8> %z
 }

 define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
-; SSE2-LABEL: v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: movd %xmm0, %r8d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r8d, %edx
-; SSE2-NEXT: subw %ax, %dx
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %r8w
-; SSE2-NEXT: cmovol %ecx, %r8d
-; SSE2-NEXT: pextrw $1, %xmm1, %eax
-; SSE2-NEXT: pextrw $1, %xmm0, %r9d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r9d, %edx
-; SSE2-NEXT: subw %ax, %dx
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %r9w
-; SSE2-NEXT: cmovol %ecx, %r9d
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
-; SSE2-NEXT: pextrw $2, %xmm0, %r10d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r10d, %esi
-; SSE2-NEXT: subw %ax, %si
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %r10w
-; SSE2-NEXT:
cmovol %ecx, %r10d -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: pextrw $3, %xmm0, %r11d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r11d, %edi -; SSE2-NEXT: subw %ax, %di -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r11w -; SSE2-NEXT: cmovol %ecx, %r11d -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: pextrw $4, %xmm0, %edi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %di -; SSE2-NEXT: cmovol %ecx, %edi -; SSE2-NEXT: pextrw $5, %xmm1, %ecx -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: subw %cx, %si -; SSE2-NEXT: setns %dl -; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE2-NEXT: subw %cx, %ax -; SSE2-NEXT: cmovol %edx, %eax -; SSE2-NEXT: pextrw $6, %xmm1, %edx -; SSE2-NEXT: pextrw $6, %xmm0, %ecx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %ecx, %ebx -; SSE2-NEXT: subw %dx, %bx -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE2-NEXT: subw %dx, %cx -; SSE2-NEXT: cmovol %esi, %ecx -; SSE2-NEXT: pextrw $7, %xmm1, %edx -; SSE2-NEXT: pextrw $7, %xmm0, %esi -; SSE2-NEXT: xorl %ebx, %ebx -; SSE2-NEXT: movl %esi, %ebp -; SSE2-NEXT: subw %dx, %bp -; SSE2-NEXT: setns %bl -; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSE2-NEXT: subw %dx, %si -; SSE2-NEXT: cmovol %ebx, %esi -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %r11d, %xmm0 -; SSE2-NEXT: movd %r10d, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: movd %xmm0, %r8d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r8d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r8w -; SSSE3-NEXT: cmovol %ecx, %r8d -; SSSE3-NEXT: pextrw $1, %xmm1, %eax -; SSSE3-NEXT: pextrw $1, %xmm0, %r9d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r9d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r9w -; SSSE3-NEXT: cmovol %ecx, %r9d -; SSSE3-NEXT: pextrw $2, %xmm1, %eax -; SSSE3-NEXT: pextrw $2, %xmm0, %r10d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r10d, %esi -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r10w -; SSSE3-NEXT: cmovol %ecx, %r10d -; SSSE3-NEXT: pextrw $3, %xmm1, %eax -; SSSE3-NEXT: pextrw $3, %xmm0, %r11d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl 
%r11d, %edi -; SSSE3-NEXT: subw %ax, %di -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r11w -; SSSE3-NEXT: cmovol %ecx, %r11d -; SSSE3-NEXT: pextrw $4, %xmm1, %eax -; SSSE3-NEXT: pextrw $4, %xmm0, %edi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %edi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %di -; SSSE3-NEXT: cmovol %ecx, %edi -; SSSE3-NEXT: pextrw $5, %xmm1, %ecx -; SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSSE3-NEXT: subw %cx, %ax -; SSSE3-NEXT: cmovol %edx, %eax -; SSSE3-NEXT: pextrw $6, %xmm1, %edx -; SSSE3-NEXT: pextrw $6, %xmm0, %ecx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %ecx, %ebx -; SSSE3-NEXT: subw %dx, %bx -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSSE3-NEXT: subw %dx, %cx -; SSSE3-NEXT: cmovol %esi, %ecx -; SSSE3-NEXT: pextrw $7, %xmm1, %edx -; SSSE3-NEXT: pextrw $7, %xmm0, %esi -; SSSE3-NEXT: xorl %ebx, %ebx -; SSSE3-NEXT: movl %esi, %ebp -; SSSE3-NEXT: subw %dx, %bp -; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSSE3-NEXT: subw %dx, %si -; SSSE3-NEXT: cmovol %ebx, %esi -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movd %edi, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %r11d, %xmm0 -; SSSE3-NEXT: movd %r10d, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrw $7, %xmm1, %eax -; SSE41-NEXT: pextrw $7, %xmm0, %r8d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r8d, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r8w -; SSE41-NEXT: cmovol %ecx, %r8d -; SSE41-NEXT: pextrw $6, %xmm1, %eax -; SSE41-NEXT: pextrw $6, %xmm0, %r9d -; SSE41-NEXT: xorl %edx, %edx -; SSE41-NEXT: movl %r9d, %esi -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r9w -; SSE41-NEXT: cmovol %edx, %r9d -; SSE41-NEXT: pextrw $5, %xmm1, %eax -; SSE41-NEXT: pextrw $5, %xmm0, %r10d -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %r10d, %edi -; SSE41-NEXT: subw %ax, %di -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r10w -; SSE41-NEXT: cmovol %esi, %r10d -; SSE41-NEXT: pextrw $4, %xmm1, %eax -; SSE41-NEXT: pextrw $4, %xmm0, %r11d -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %r11d, %ecx -; SSE41-NEXT: subw %ax, %cx -; SSE41-NEXT: setns %dil -; 
SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r11w
-; SSE41-NEXT: cmovol %edi, %r11d
-; SSE41-NEXT: pextrw $3, %xmm1, %eax
-; SSE41-NEXT: pextrw $3, %xmm0, %edi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %edi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %di
-; SSE41-NEXT: cmovol %ecx, %edi
-; SSE41-NEXT: pextrw $2, %xmm1, %ecx
-; SSE41-NEXT: pextrw $2, %xmm0, %eax
-; SSE41-NEXT: xorl %edx, %edx
-; SSE41-NEXT: movl %eax, %esi
-; SSE41-NEXT: subw %cx, %si
-; SSE41-NEXT: setns %dl
-; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF
-; SSE41-NEXT: subw %cx, %ax
-; SSE41-NEXT: cmovol %edx, %eax
-; SSE41-NEXT: movd %xmm1, %ecx
-; SSE41-NEXT: movd %xmm0, %edx
-; SSE41-NEXT: xorl %esi, %esi
-; SSE41-NEXT: movl %edx, %ebx
-; SSE41-NEXT: subw %cx, %bx
-; SSE41-NEXT: setns %sil
-; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF
-; SSE41-NEXT: subw %cx, %dx
-; SSE41-NEXT: cmovol %esi, %edx
-; SSE41-NEXT: pextrw $1, %xmm1, %ecx
-; SSE41-NEXT: pextrw $1, %xmm0, %esi
-; SSE41-NEXT: xorl %ebx, %ebx
-; SSE41-NEXT: movl %esi, %ebp
-; SSE41-NEXT: subw %cx, %bp
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF
-; SSE41-NEXT: subw %cx, %si
-; SSE41-NEXT: cmovol %ebx, %esi
-; SSE41-NEXT: movd %edx, %xmm0
-; SSE41-NEXT: pinsrw $1, %esi, %xmm0
-; SSE41-NEXT: pinsrw $2, %eax, %xmm0
-; SSE41-NEXT: pinsrw $3, %edi, %xmm0
-; SSE41-NEXT: pinsrw $4, %r11d, %xmm0
-; SSE41-NEXT: pinsrw $5, %r10d, %xmm0
-; SSE41-NEXT: pinsrw $6, %r9d, %xmm0
-; SSE41-NEXT: pinsrw $7, %r8d, %xmm0
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v8i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psubsw %xmm1, %xmm0
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: v8i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbp
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: vpextrw $7, %xmm1, %eax
-; AVX-NEXT: vpextrw $7, %xmm0, %r8d
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: movl %r8d, %edx
-; AVX-NEXT: subw %ax, %dx
-; AVX-NEXT: setns %cl
-; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX-NEXT: subw %ax, %r8w
-; AVX-NEXT: cmovol %ecx, %r8d
-; AVX-NEXT: vpextrw $6, %xmm1, %eax
-; AVX-NEXT: vpextrw $6, %xmm0, %r9d
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: movl %r9d, %esi
-; AVX-NEXT: subw %ax, %si
-; AVX-NEXT: setns %dl
-; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX-NEXT: subw %ax, %r9w
-; AVX-NEXT: cmovol %edx, %r9d
-; AVX-NEXT: vpextrw $5, %xmm1, %eax
-; AVX-NEXT: vpextrw $5, %xmm0, %r10d
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: movl %r10d, %edi
-; AVX-NEXT: subw %ax, %di
-; AVX-NEXT: setns %sil
-; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF
-; AVX-NEXT: subw %ax, %r10w
-; AVX-NEXT: cmovol %esi, %r10d
-; AVX-NEXT: vpextrw $4, %xmm1, %eax
-; AVX-NEXT: vpextrw $4, %xmm0, %r11d
-; AVX-NEXT: xorl %edi, %edi
-; AVX-NEXT: movl %r11d, %ecx
-; AVX-NEXT: subw %ax, %cx
-; AVX-NEXT: setns %dil
-; AVX-NEXT: addl $32767, %edi # imm = 0x7FFF
-; AVX-NEXT: subw %ax, %r11w
-; AVX-NEXT: cmovol %edi, %r11d
-; AVX-NEXT: vpextrw $3, %xmm1, %eax
-; AVX-NEXT: vpextrw $3, %xmm0, %edi
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: movl %edi, %edx
-; AVX-NEXT: subw %ax, %dx
-; AVX-NEXT: setns %cl
-; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX-NEXT: subw %ax, %di
-; AVX-NEXT: cmovol %ecx, %edi
-; AVX-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: movl %eax, %esi
-; AVX-NEXT: subw %cx, %si
-; AVX-NEXT: setns %dl
-; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX-NEXT: subw %cx, %ax
-; AVX-NEXT: cmovol %edx, %eax
-; AVX-NEXT: vmovd %xmm1, %ecx
-; AVX-NEXT: vmovd %xmm0, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: movl %edx, %ebx
-; AVX-NEXT: subw %cx, %bx
-; AVX-NEXT: setns %sil
-; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF
-; AVX-NEXT: subw %cx, %dx
-; AVX-NEXT: cmovol %esi, %edx
-; AVX-NEXT: vpextrw $1, %xmm1, %ecx
-; AVX-NEXT: vpextrw $1, %xmm0, %esi
-; AVX-NEXT: xorl %ebx, %ebx
-; AVX-NEXT: movl %esi, %ebp
-; AVX-NEXT: subw %cx, %bp
-; AVX-NEXT: setns %bl
-; AVX-NEXT: addl $32767, %ebx # imm = 0x7FFF
-; AVX-NEXT: subw %cx, %si
-; AVX-NEXT: cmovol %ebx, %esi
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %rbp
+; AVX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 %z = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
 ret <8 x i16> %z
 }

 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; SSE2-LABEL: v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: movd %xmm1, %esi
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %esi, %edx
-; SSE2-NEXT: subw %ax, %dx
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %si
-; SSE2-NEXT: cmovol %ecx, %esi
-; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: pextrw $1, %xmm3, %eax
-; SSE2-NEXT: pextrw $1, %xmm1, %esi
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %esi, %edx
-; SSE2-NEXT: subw %ax, %dx
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %si
-; SSE2-NEXT: cmovol %ecx, %esi
-; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: pextrw $2, %xmm3, %eax
-; SSE2-NEXT: pextrw $2, %xmm1, %esi
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %esi, %edx
-; SSE2-NEXT: subw %ax, %dx
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %si
-; SSE2-NEXT: cmovol %ecx, %esi
-; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: pextrw $3, %xmm3, %eax
-; SSE2-NEXT: pextrw $3, %xmm1, %esi
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %esi, %edx
-; SSE2-NEXT: subw %ax, %dx
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %si
-; SSE2-NEXT: cmovol %ecx, %esi
-; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: pextrw $4, %xmm3, %eax
-; SSE2-NEXT: pextrw $4, %xmm1, %r14d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r14d, %edx
-; SSE2-NEXT: subw %ax, %dx
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %r14w
-; SSE2-NEXT: cmovol %ecx, %r14d
-; SSE2-NEXT: pextrw $5, %xmm3, %eax
-; SSE2-NEXT: pextrw $5, %xmm1, %r15d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r15d, %edx
-; SSE2-NEXT: subw %ax, %dx
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %r15w
-; SSE2-NEXT: cmovol %ecx, %r15d
-; SSE2-NEXT: pextrw $6, %xmm3, %eax
-; SSE2-NEXT: pextrw $6, %xmm1, %r12d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r12d, %edx
-; SSE2-NEXT: subw
%ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r12w -; SSE2-NEXT: cmovol %ecx, %r12d -; SSE2-NEXT: pextrw $7, %xmm3, %eax -; SSE2-NEXT: pextrw $7, %xmm1, %r13d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r13d, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r13w -; SSE2-NEXT: cmovol %ecx, %r13d -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: movd %xmm0, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: pextrw $1, %xmm2, %eax -; SSE2-NEXT: pextrw $1, %xmm0, %ebx -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebx, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %bx -; SSE2-NEXT: cmovol %ecx, %ebx -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: pextrw $2, %xmm0, %ebp -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebp, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %bp -; SSE2-NEXT: cmovol %ecx, %ebp -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: pextrw $3, %xmm0, %edi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %di -; SSE2-NEXT: cmovol %ecx, %edi -; SSE2-NEXT: pextrw $4, %xmm2, %ecx -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: subw %cx, %r8w -; SSE2-NEXT: setns %dl -; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE2-NEXT: subw %cx, %ax -; SSE2-NEXT: cmovol %edx, %eax -; SSE2-NEXT: pextrw $5, %xmm2, %r8d -; SSE2-NEXT: pextrw $5, %xmm0, %ecx -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movl %ecx, %r9d -; SSE2-NEXT: subw %r8w, %r9w -; SSE2-NEXT: setns %dl -; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE2-NEXT: subw %r8w, %cx -; SSE2-NEXT: cmovol %edx, %ecx -; SSE2-NEXT: pextrw $6, %xmm2, %r8d -; SSE2-NEXT: pextrw $6, %xmm0, %r9d -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movl %r9d, %r10d -; SSE2-NEXT: subw %r8w, %r10w -; SSE2-NEXT: setns %dl -; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE2-NEXT: subw %r8w, %r9w -; SSE2-NEXT: cmovol %edx, %r9d -; SSE2-NEXT: pextrw $7, %xmm2, %r8d -; SSE2-NEXT: pextrw $7, %xmm0, %edx -; SSE2-NEXT: xorl %r10d, %r10d -; SSE2-NEXT: movl %edx, %r11d -; SSE2-NEXT: subw %r8w, %r11w -; SSE2-NEXT: setns %r10b -; SSE2-NEXT: addl $32767, %r10d # imm = 0x7FFF -; SSE2-NEXT: subw %r8w, %dx -; SSE2-NEXT: cmovol %r10d, %edx -; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: movd %ecx, %xmm9 -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movd %edi, %xmm10 -; SSE2-NEXT: movd %ebp, %xmm7 -; SSE2-NEXT: movd %ebx, %xmm11 -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: movd %r13d, %xmm12 -; SSE2-NEXT: movd %r12d, %xmm6 -; SSE2-NEXT: movd %r15d, %xmm13 -; SSE2-NEXT: movd %r14d, %xmm5 -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE2-NEXT: movd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movd %xmm3, %eax -; SSSE3-NEXT: movd %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $1, %xmm3, %eax -; SSSE3-NEXT: pextrw $1, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $2, %xmm3, %eax -; SSSE3-NEXT: pextrw $2, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $3, %xmm3, %eax -; SSSE3-NEXT: pextrw $3, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $4, %xmm3, %eax -; SSSE3-NEXT: pextrw $4, %xmm1, %r14d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r14d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r14w -; SSSE3-NEXT: cmovol %ecx, %r14d -; SSSE3-NEXT: pextrw $5, %xmm3, %eax -; SSSE3-NEXT: pextrw $5, 
%xmm1, %r15d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r15d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r15w -; SSSE3-NEXT: cmovol %ecx, %r15d -; SSSE3-NEXT: pextrw $6, %xmm3, %eax -; SSSE3-NEXT: pextrw $6, %xmm1, %r12d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r12d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r12w -; SSSE3-NEXT: cmovol %ecx, %r12d -; SSSE3-NEXT: pextrw $7, %xmm3, %eax -; SSSE3-NEXT: pextrw $7, %xmm1, %r13d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r13d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r13w -; SSSE3-NEXT: cmovol %ecx, %r13d -; SSSE3-NEXT: movd %xmm2, %eax -; SSSE3-NEXT: movd %xmm0, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: pextrw $1, %xmm2, %eax -; SSSE3-NEXT: pextrw $1, %xmm0, %ebx -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %bx -; SSSE3-NEXT: cmovol %ecx, %ebx -; SSSE3-NEXT: pextrw $2, %xmm2, %eax -; SSSE3-NEXT: pextrw $2, %xmm0, %ebp -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebp, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %bp -; SSSE3-NEXT: cmovol %ecx, %ebp -; SSSE3-NEXT: pextrw $3, %xmm2, %eax -; SSSE3-NEXT: pextrw $3, %xmm0, %edi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %edi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %di -; SSSE3-NEXT: cmovol %ecx, %edi -; SSSE3-NEXT: pextrw $4, %xmm2, %ecx -; SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: subw %cx, %r8w -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSSE3-NEXT: subw %cx, %ax -; SSSE3-NEXT: cmovol %edx, %eax -; SSSE3-NEXT: pextrw $5, %xmm2, %r8d -; SSSE3-NEXT: pextrw $5, %xmm0, %ecx -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movl %ecx, %r9d -; SSSE3-NEXT: subw %r8w, %r9w -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSSE3-NEXT: subw %r8w, %cx -; SSSE3-NEXT: cmovol %edx, %ecx -; SSSE3-NEXT: pextrw $6, %xmm2, %r8d -; SSSE3-NEXT: pextrw $6, %xmm0, %r9d -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movl %r9d, %r10d -; SSSE3-NEXT: subw %r8w, %r10w -; SSSE3-NEXT: setns %dl -; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSSE3-NEXT: subw %r8w, %r9w -; SSSE3-NEXT: cmovol %edx, %r9d -; SSSE3-NEXT: pextrw $7, %xmm2, %r8d -; SSSE3-NEXT: pextrw $7, %xmm0, %edx -; SSSE3-NEXT: xorl %r10d, %r10d -; SSSE3-NEXT: movl %edx, %r11d -; SSSE3-NEXT: subw %r8w, %r11w -; SSSE3-NEXT: setns %r10b -; SSSE3-NEXT: addl $32767, %r10d # imm = 0x7FFF -; SSSE3-NEXT: subw %r8w, %dx -; SSSE3-NEXT: cmovol %r10d, %edx -; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: movd %ecx, %xmm9 -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movd %edi, %xmm10 -; SSSE3-NEXT: movd %ebp, %xmm7 -; SSSE3-NEXT: movd %ebx, %xmm11 -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: 
movd %r13d, %xmm12 -; SSSE3-NEXT: movd %r12d, %xmm6 -; SSSE3-NEXT: movd %r15d, %xmm13 -; SSSE3-NEXT: movd %r14d, %xmm5 -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrw $7, %xmm3, %eax -; SSE41-NEXT: pextrw $7, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $6, %xmm3, %eax -; SSE41-NEXT: pextrw $6, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $5, %xmm3, %eax -; SSE41-NEXT: pextrw $5, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $4, %xmm3, %eax -; SSE41-NEXT: pextrw $4, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; 
SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $3, %xmm3, %eax -; SSE41-NEXT: pextrw $3, %xmm1, %r14d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r14d, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r14w -; SSE41-NEXT: cmovol %ecx, %r14d -; SSE41-NEXT: pextrw $2, %xmm3, %eax -; SSE41-NEXT: pextrw $2, %xmm1, %r15d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r15d, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r15w -; SSE41-NEXT: cmovol %ecx, %r15d -; SSE41-NEXT: movd %xmm3, %eax -; SSE41-NEXT: movd %xmm1, %r12d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r12d, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r12w -; SSE41-NEXT: cmovol %ecx, %r12d -; SSE41-NEXT: pextrw $1, %xmm3, %eax -; SSE41-NEXT: pextrw $1, %xmm1, %r13d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r13d, %esi -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r13w -; SSE41-NEXT: cmovol %ecx, %r13d -; SSE41-NEXT: pextrw $7, %xmm2, %eax -; SSE41-NEXT: pextrw $7, %xmm0, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edi -; SSE41-NEXT: subw %ax, %di -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: pextrw $6, %xmm2, %eax -; SSE41-NEXT: pextrw $6, %xmm0, %ebx -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %ebx, %edi -; SSE41-NEXT: subw %ax, %di -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %bx -; SSE41-NEXT: cmovol %ecx, %ebx -; SSE41-NEXT: pextrw $5, %xmm2, %eax -; SSE41-NEXT: pextrw $5, %xmm0, %ebp -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %ebp, %edi -; SSE41-NEXT: subw %ax, %di -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %bp -; SSE41-NEXT: cmovol %ecx, %ebp -; SSE41-NEXT: pextrw $4, %xmm2, %eax -; SSE41-NEXT: pextrw $4, %xmm0, %edi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %edi, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %di -; SSE41-NEXT: cmovol %ecx, %edi -; SSE41-NEXT: pextrw $3, %xmm2, %ecx -; SSE41-NEXT: pextrw $3, %xmm0, %eax -; SSE41-NEXT: xorl %edx, %edx -; SSE41-NEXT: movl %eax, %r8d -; SSE41-NEXT: subw %cx, %r8w -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE41-NEXT: subw %cx, %ax -; SSE41-NEXT: cmovol %edx, %eax -; SSE41-NEXT: pextrw $2, %xmm2, %r8d -; SSE41-NEXT: pextrw $2, %xmm0, %ecx -; SSE41-NEXT: xorl %edx, %edx -; SSE41-NEXT: movl %ecx, %r9d -; SSE41-NEXT: subw %r8w, %r9w -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE41-NEXT: subw %r8w, %cx -; SSE41-NEXT: cmovol %edx, %ecx -; SSE41-NEXT: movd %xmm2, %r8d -; SSE41-NEXT: movd %xmm0, %r9d -; SSE41-NEXT: xorl %edx, %edx -; SSE41-NEXT: movl %r9d, %r10d -; SSE41-NEXT: subw %r8w, %r10w -; SSE41-NEXT: setns %dl -; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF -; SSE41-NEXT: subw %r8w, %r9w -; SSE41-NEXT: cmovol %edx, %r9d -; SSE41-NEXT: pextrw $1, %xmm2, %r8d -; SSE41-NEXT: 
pextrw $1, %xmm0, %edx
-; SSE41-NEXT: xorl %r10d, %r10d
-; SSE41-NEXT: movl %edx, %r11d
-; SSE41-NEXT: subw %r8w, %r11w
-; SSE41-NEXT: setns %r10b
-; SSE41-NEXT: addl $32767, %r10d # imm = 0x7FFF
-; SSE41-NEXT: subw %r8w, %dx
-; SSE41-NEXT: cmovol %r10d, %edx
-; SSE41-NEXT: movd %r9d, %xmm0
-; SSE41-NEXT: pinsrw $1, %edx, %xmm0
-; SSE41-NEXT: pinsrw $2, %ecx, %xmm0
-; SSE41-NEXT: pinsrw $3, %eax, %xmm0
-; SSE41-NEXT: pinsrw $4, %edi, %xmm0
-; SSE41-NEXT: pinsrw $5, %ebp, %xmm0
-; SSE41-NEXT: pinsrw $6, %ebx, %xmm0
-; SSE41-NEXT: pinsrw $7, %esi, %xmm0
-; SSE41-NEXT: movd %r12d, %xmm1
-; SSE41-NEXT: pinsrw $1, %r13d, %xmm1
-; SSE41-NEXT: pinsrw $2, %r15d, %xmm1
-; SSE41-NEXT: pinsrw $3, %r14d, %xmm1
-; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v16i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psubsw %xmm2, %xmm0
+; SSE-NEXT: psubsw %xmm3, %xmm1
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: v16i16:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vpextrw $7, %xmm1, %eax
-; AVX1-NEXT: vpextrw $7, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $6, %xmm1, %eax
-; AVX1-NEXT: vpextrw $6, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $5, %xmm1, %eax
-; AVX1-NEXT: vpextrw $5, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $4, %xmm1, %eax
-; AVX1-NEXT: vpextrw $4, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $3, %xmm1, %eax
-; AVX1-NEXT: vpextrw $3, %xmm0, %r14d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r14d, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r14w
-; AVX1-NEXT: cmovol %ecx, %r14d
-; AVX1-NEXT: vpextrw $2, %xmm1, %eax
-; AVX1-NEXT: vpextrw $2, %xmm0, %r15d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r15d, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r15w
-; AVX1-NEXT: cmovol %ecx,
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: vmovd %xmm0, %r12d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r12d, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r12w
-; AVX1-NEXT: cmovol %ecx, %r12d
-; AVX1-NEXT: vpextrw $1, %xmm1, %eax
-; AVX1-NEXT: vpextrw $1, %xmm0, %r13d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r13d, %esi
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r13w
-; AVX1-NEXT: cmovol %ecx, %r13d
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpextrw $7, %xmm1, %eax
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpextrw $7, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edi
-; AVX1-NEXT: subw %ax, %di
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: vpextrw $6, %xmm1, %eax
-; AVX1-NEXT: vpextrw $6, %xmm0, %ebx
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %ebx, %edi
-; AVX1-NEXT: subw %ax, %di
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %bx
-; AVX1-NEXT: cmovol %ecx, %ebx
-; AVX1-NEXT: vpextrw $5, %xmm1, %eax
-; AVX1-NEXT: vpextrw $5, %xmm0, %ebp
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %ebp, %edi
-; AVX1-NEXT: subw %ax, %di
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %bp
-; AVX1-NEXT: cmovol %ecx, %ebp
-; AVX1-NEXT: vpextrw $4, %xmm1, %eax
-; AVX1-NEXT: vpextrw $4, %xmm0, %edi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %edi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %di
-; AVX1-NEXT: cmovol %ecx, %edi
-; AVX1-NEXT: vpextrw $3, %xmm1, %ecx
-; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: movl %eax, %r8d
-; AVX1-NEXT: subw %cx, %r8w
-; AVX1-NEXT: setns %dl
-; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX1-NEXT: subw %cx, %ax
-; AVX1-NEXT: cmovol %edx, %eax
-; AVX1-NEXT: vpextrw $2, %xmm1, %r8d
-; AVX1-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: movl %ecx, %r9d
-; AVX1-NEXT: subw %r8w, %r9w
-; AVX1-NEXT: setns %dl
-; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX1-NEXT: subw %r8w, %cx
-; AVX1-NEXT: cmovol %edx, %ecx
-; AVX1-NEXT: vmovd %xmm1, %r8d
-; AVX1-NEXT: vmovd %xmm0, %r9d
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: movl %r9d, %r10d
-; AVX1-NEXT: subw %r8w, %r10w
-; AVX1-NEXT: setns %dl
-; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX1-NEXT: subw %r8w, %r9w
-; AVX1-NEXT: cmovol %edx, %r9d
-; AVX1-NEXT: vpextrw $1, %xmm1, %r8d
-; AVX1-NEXT: vpextrw $1, %xmm0, %edx
-; AVX1-NEXT: xorl %r10d, %r10d
-; AVX1-NEXT: movl %edx, %r11d
-; AVX1-NEXT: subw %r8w, %r11w
-; AVX1-NEXT: setns %r10b
-; AVX1-NEXT: addl $32767, %r10d # imm = 0x7FFF
-; AVX1-NEXT: subw %r8w, %dx
-; AVX1-NEXT: cmovol %r10d, %edx
-; AVX1-NEXT: vmovd %r9d, %xmm0
-; AVX1-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %r12d, %xmm1
-; AVX1-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-NEXT: vpextrw $7, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $6, %xmm1, %eax
-; AVX2-NEXT: vpextrw $6, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $5, %xmm1, %eax
-; AVX2-NEXT: vpextrw $5, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $4, %xmm1, %eax
-; AVX2-NEXT: vpextrw $4, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $3, %xmm1, %eax
-; AVX2-NEXT: vpextrw $3, %xmm0, %r14d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r14d, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %r14w
-; AVX2-NEXT: cmovol %ecx, %r14d
-; AVX2-NEXT: vpextrw $2, %xmm1, %eax
-; AVX2-NEXT: vpextrw $2, %xmm0, %r15d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r15d, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %r15w
-; AVX2-NEXT: cmovol %ecx, %r15d
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: vmovd %xmm0, %r12d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r12d, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %r12w
-; AVX2-NEXT: cmovol %ecx, %r12d
-; AVX2-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-NEXT: vpextrw $1, %xmm0, %r13d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r13d, %esi
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %r13w
-; AVX2-NEXT: cmovol %ecx, %r13d
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpextrw $7, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edi
-; AVX2-NEXT: subw %ax, %di
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: vpextrw $6, %xmm1, %eax
-; AVX2-NEXT: vpextrw $6, %xmm0, %ebx
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %ebx, %edi
-; AVX2-NEXT: subw %ax, %di
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %bx
-; AVX2-NEXT: cmovol %ecx, %ebx
-; AVX2-NEXT: vpextrw $5, %xmm1, %eax
-; AVX2-NEXT: vpextrw $5, %xmm0, %ebp
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %ebp, %edi
-; AVX2-NEXT: subw %ax, %di
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %bp
-; AVX2-NEXT: cmovol %ecx, %ebp
-; AVX2-NEXT: vpextrw $4, %xmm1, %eax
-; AVX2-NEXT: vpextrw $4, %xmm0, %edi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %edi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %di
-; AVX2-NEXT: cmovol %ecx, %edi
-; AVX2-NEXT: vpextrw $3, %xmm1, %ecx
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: movl %eax, %r8d
-; AVX2-NEXT: subw %cx, %r8w
-; AVX2-NEXT: setns %dl
-; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX2-NEXT: subw %cx, %ax
-; AVX2-NEXT: cmovol %edx, %eax
-; AVX2-NEXT: vpextrw $2, %xmm1, %r8d
-; AVX2-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: subw %r8w, %r9w
-; AVX2-NEXT: setns %dl
-; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX2-NEXT: subw %r8w, %cx
-; AVX2-NEXT: cmovol %edx, %ecx
-; AVX2-NEXT: vmovd %xmm1, %r8d
-; AVX2-NEXT: vmovd %xmm0, %r9d
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: movl %r9d, %r10d
-; AVX2-NEXT: subw %r8w, %r10w
-; AVX2-NEXT: setns %dl
-; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX2-NEXT: subw %r8w, %r9w
-; AVX2-NEXT: cmovol %edx, %r9d
-; AVX2-NEXT: vpextrw $1, %xmm1, %r8d
-; AVX2-NEXT: vpextrw $1, %xmm0, %edx
-; AVX2-NEXT: xorl %r10d, %r10d
-; AVX2-NEXT: movl %edx, %r11d
-; AVX2-NEXT: subw %r8w, %r11w
-; AVX2-NEXT: setns %r10b
-; AVX2-NEXT: addl $32767, %r10d # imm = 0x7FFF
-; AVX2-NEXT: subw %r8w, %dx
-; AVX2-NEXT: cmovol %r10d, %edx
-; AVX2-NEXT: vmovd %r9d, %xmm0
-; AVX2-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %r12d, %xmm1
-; AVX2-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v16i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: vpextrw $7, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: vpextrw $6, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: vpextrw $5, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: vpextrw $4, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: vpextrw $3, %xmm0, %r14d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r14d, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %r14w
-; AVX512-NEXT: cmovol %ecx, %r14d
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: vpextrw $2, %xmm0, %r15d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r15d, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %r15w
-; AVX512-NEXT: cmovol %ecx, %r15d
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: vmovd %xmm0, %r12d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r12d, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %r12w
-; AVX512-NEXT: cmovol %ecx, %r12d
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: vpextrw $1, %xmm0, %r13d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r13d, %esi
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %r13w
-; AVX512-NEXT: cmovol %ecx, %r13d
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpextrw $7, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edi
-; AVX512-NEXT: subw %ax, %di
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: vpextrw $6, %xmm0, %ebx
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %ebx, %edi
-; AVX512-NEXT: subw %ax, %di
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %bx
-; AVX512-NEXT: cmovol %ecx, %ebx
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: vpextrw $5, %xmm0, %ebp
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %ebp, %edi
-; AVX512-NEXT: subw %ax, %di
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %bp
-; AVX512-NEXT: cmovol %ecx, %ebp
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: vpextrw $4, %xmm0, %edi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %edi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %di
-; AVX512-NEXT: cmovol %ecx, %edi
-; AVX512-NEXT: vpextrw $3, %xmm1, %ecx
-; AVX512-NEXT: vpextrw $3, %xmm0, %eax
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: movl %eax, %r8d
-; AVX512-NEXT: subw %cx, %r8w
-; AVX512-NEXT: setns %dl
-; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX512-NEXT: subw %cx, %ax
-; AVX512-NEXT: cmovol %edx, %eax
-; AVX512-NEXT: vpextrw $2, %xmm1, %r8d
-; AVX512-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: subw %r8w, %r9w
-; AVX512-NEXT: setns %dl
-; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX512-NEXT: subw %r8w, %cx
-; AVX512-NEXT: cmovol %edx, %ecx
-; AVX512-NEXT: vmovd %xmm1, %r8d
-; AVX512-NEXT: vmovd %xmm0, %r9d
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: movl %r9d, %r10d
-; AVX512-NEXT: subw %r8w, %r10w
-; AVX512-NEXT: setns %dl
-; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF
-; AVX512-NEXT: subw %r8w, %r9w
-; AVX512-NEXT: cmovol %edx, %r9d
-; AVX512-NEXT: vpextrw $1, %xmm1, %r8d
-; AVX512-NEXT: vpextrw $1, %xmm0, %edx
-; AVX512-NEXT: xorl %r10d, %r10d
-; AVX512-NEXT: movl %edx, %r11d
-; AVX512-NEXT: subw %r8w, %r11w
-; AVX512-NEXT: setns %r10b
-; AVX512-NEXT: addl $32767, %r10d # imm = 0x7FFF
-; AVX512-NEXT: subw %r8w, %dx
-; AVX512-NEXT: cmovol %r10d, %edx
-; AVX512-NEXT: vmovd %r9d, %xmm0
-; AVX512-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %r12d, %xmm1
-; AVX512-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%z = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
ret <16 x i16> %z
}
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; SSE2-LABEL: v32i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-;
SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pushq %rax -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: movd %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $1, %xmm5, %eax -; SSE2-NEXT: pextrw $1, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $2, %xmm5, %eax -; SSE2-NEXT: pextrw $2, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $3, %xmm5, %eax -; SSE2-NEXT: pextrw $3, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $4, %xmm5, %eax -; SSE2-NEXT: pextrw $4, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $5, %xmm5, %eax -; SSE2-NEXT: pextrw $5, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $6, %xmm5, %eax -; SSE2-NEXT: pextrw $6, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $7, %xmm5, %eax -; SSE2-NEXT: pextrw $7, %xmm1, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movd %xmm6, %eax -; SSE2-NEXT: movd %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $1, %xmm6, %eax -; SSE2-NEXT: pextrw $1, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl 
$32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $2, %xmm6, %eax -; SSE2-NEXT: pextrw $2, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $3, %xmm6, %eax -; SSE2-NEXT: pextrw $3, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $4, %xmm6, %eax -; SSE2-NEXT: pextrw $4, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $5, %xmm6, %eax -; SSE2-NEXT: pextrw $5, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $6, %xmm6, %eax -; SSE2-NEXT: pextrw $6, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $7, %xmm6, %eax -; SSE2-NEXT: pextrw $7, %xmm2, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movd %xmm7, %eax -; SSE2-NEXT: movd %xmm3, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $1, %xmm7, %eax -; SSE2-NEXT: pextrw $1, %xmm3, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $2, %xmm7, %eax -; SSE2-NEXT: pextrw $2, %xmm3, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $3, %xmm7, %eax -; SSE2-NEXT: pextrw $3, %xmm3, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; 
SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: pextrw $4, %xmm7, %eax -; SSE2-NEXT: pextrw $4, %xmm3, %ebp -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebp, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %bp -; SSE2-NEXT: cmovol %ecx, %ebp -; SSE2-NEXT: pextrw $5, %xmm7, %eax -; SSE2-NEXT: pextrw $5, %xmm3, %ebx -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebx, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %bx -; SSE2-NEXT: cmovol %ecx, %ebx -; SSE2-NEXT: pextrw $6, %xmm7, %eax -; SSE2-NEXT: pextrw $6, %xmm3, %r11d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r11d, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r11w -; SSE2-NEXT: cmovol %ecx, %r11d -; SSE2-NEXT: pextrw $7, %xmm7, %eax -; SSE2-NEXT: pextrw $7, %xmm3, %r10d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r10d, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r10w -; SSE2-NEXT: cmovol %ecx, %r10d -; SSE2-NEXT: movd %xmm4, %eax -; SSE2-NEXT: movd %xmm0, %r9d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r9d, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r9w -; SSE2-NEXT: cmovol %ecx, %r9d -; SSE2-NEXT: pextrw $1, %xmm4, %eax -; SSE2-NEXT: pextrw $1, %xmm0, %r8d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r8d, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r8w -; SSE2-NEXT: cmovol %ecx, %r8d -; SSE2-NEXT: pextrw $2, %xmm4, %eax -; SSE2-NEXT: pextrw $2, %xmm0, %edi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %di -; SSE2-NEXT: cmovol %ecx, %edi -; SSE2-NEXT: pextrw $3, %xmm4, %eax -; SSE2-NEXT: pextrw $3, %xmm0, %esi -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %esi, %edx -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: cmovol %ecx, %esi -; SSE2-NEXT: pextrw $4, %xmm4, %eax -; SSE2-NEXT: pextrw $4, %xmm0, %edx -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edx, %r13d -; SSE2-NEXT: subw %ax, %r13w -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: cmovol %ecx, %edx -; SSE2-NEXT: pextrw $5, %xmm4, %r13d -; SSE2-NEXT: pextrw $5, %xmm0, %ecx -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movl %ecx, %r12d -; SSE2-NEXT: subw %r13w, %r12w -; SSE2-NEXT: setns %al -; SSE2-NEXT: addl $32767, %eax # imm = 0x7FFF -; SSE2-NEXT: subw %r13w, %cx -; SSE2-NEXT: cmovol %eax, %ecx -; SSE2-NEXT: pextrw $6, %xmm4, %r12d -; SSE2-NEXT: pextrw $6, %xmm0, %r13d -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movl %r13d, %r15d -; SSE2-NEXT: subw %r12w, %r15w -; SSE2-NEXT: setns %al -; SSE2-NEXT: addl $32767, %eax # imm = 0x7FFF -; SSE2-NEXT: subw %r12w, %r13w -; SSE2-NEXT: cmovol %eax, %r13d -; SSE2-NEXT: pextrw $7, %xmm4, %r15d -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: xorl %r12d, %r12d -; SSE2-NEXT: movl %eax, %r14d 
-; SSE2-NEXT: subw %r15w, %r14w -; SSE2-NEXT: setns %r12b -; SSE2-NEXT: addl $32767, %r12d # imm = 0x7FFF -; SSE2-NEXT: subw %r15w, %ax -; SSE2-NEXT: cmovol %r12d, %eax -; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: movd %r13d, %xmm12 -; SSE2-NEXT: movd %ecx, %xmm8 -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: movd %r8d, %xmm13 -; SSE2-NEXT: movd %r9d, %xmm5 -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE2-NEXT: movd %r10d, %xmm11 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE2-NEXT: movd 
%r11d, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; SSE2-NEXT: movd %ebx, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE2-NEXT: movd %ebp, %xmm15 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm15[0] -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: addq $8, %rsp -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v32i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: pushq %rax -; SSSE3-NEXT: movd %xmm5, %eax -; SSSE3-NEXT: movd %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $1, %xmm5, %eax -; SSSE3-NEXT: pextrw $1, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $2, %xmm5, %eax -; SSSE3-NEXT: pextrw $2, %xmm1, %esi -; SSSE3-NEXT: xorl 
%ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $3, %xmm5, %eax -; SSSE3-NEXT: pextrw $3, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $4, %xmm5, %eax -; SSSE3-NEXT: pextrw $4, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $5, %xmm5, %eax -; SSSE3-NEXT: pextrw $5, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $6, %xmm5, %eax -; SSSE3-NEXT: pextrw $6, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $7, %xmm5, %eax -; SSSE3-NEXT: pextrw $7, %xmm1, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movd %xmm6, %eax -; SSSE3-NEXT: movd %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $1, %xmm6, %eax -; SSSE3-NEXT: pextrw $1, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $2, %xmm6, %eax -; SSSE3-NEXT: pextrw $2, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $3, %xmm6, %eax -; SSSE3-NEXT: pextrw $3, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $4, %xmm6, %eax -; SSSE3-NEXT: pextrw $4, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $5, %xmm6, %eax -; SSSE3-NEXT: pextrw $5, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $6, %xmm6, %eax -; SSSE3-NEXT: pextrw $6, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $7, %xmm6, %eax -; SSSE3-NEXT: pextrw $7, %xmm2, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: movd %xmm7, %eax -; SSSE3-NEXT: movd %xmm3, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $1, %xmm7, %eax -; SSSE3-NEXT: pextrw $1, %xmm3, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $2, %xmm7, %eax -; SSSE3-NEXT: pextrw $2, %xmm3, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $3, %xmm7, %eax -; SSSE3-NEXT: pextrw $3, %xmm3, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSSE3-NEXT: pextrw $4, %xmm7, %eax -; SSSE3-NEXT: pextrw $4, %xmm3, %ebp -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebp, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %bp -; SSSE3-NEXT: cmovol %ecx, %ebp -; SSSE3-NEXT: pextrw $5, %xmm7, %eax -; SSSE3-NEXT: pextrw $5, %xmm3, %ebx -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebx, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %bx -; 
SSSE3-NEXT: cmovol %ecx, %ebx -; SSSE3-NEXT: pextrw $6, %xmm7, %eax -; SSSE3-NEXT: pextrw $6, %xmm3, %r11d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r11d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r11w -; SSSE3-NEXT: cmovol %ecx, %r11d -; SSSE3-NEXT: pextrw $7, %xmm7, %eax -; SSSE3-NEXT: pextrw $7, %xmm3, %r10d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r10d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r10w -; SSSE3-NEXT: cmovol %ecx, %r10d -; SSSE3-NEXT: movd %xmm4, %eax -; SSSE3-NEXT: movd %xmm0, %r9d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r9d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r9w -; SSSE3-NEXT: cmovol %ecx, %r9d -; SSSE3-NEXT: pextrw $1, %xmm4, %eax -; SSSE3-NEXT: pextrw $1, %xmm0, %r8d -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %r8d, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r8w -; SSSE3-NEXT: cmovol %ecx, %r8d -; SSSE3-NEXT: pextrw $2, %xmm4, %eax -; SSSE3-NEXT: pextrw $2, %xmm0, %edi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %edi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %di -; SSSE3-NEXT: cmovol %ecx, %edi -; SSSE3-NEXT: pextrw $3, %xmm4, %eax -; SSSE3-NEXT: pextrw $3, %xmm0, %esi -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %esi, %edx -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: cmovol %ecx, %esi -; SSSE3-NEXT: pextrw $4, %xmm4, %eax -; SSSE3-NEXT: pextrw $4, %xmm0, %edx -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %edx, %r13d -; SSSE3-NEXT: subw %ax, %r13w -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %dx -; SSSE3-NEXT: cmovol %ecx, %edx -; SSSE3-NEXT: pextrw $5, %xmm4, %r13d -; SSSE3-NEXT: pextrw $5, %xmm0, %ecx -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: movl %ecx, %r12d -; SSSE3-NEXT: subw %r13w, %r12w -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addl $32767, %eax # imm = 0x7FFF -; SSSE3-NEXT: subw %r13w, %cx -; SSSE3-NEXT: cmovol %eax, %ecx -; SSSE3-NEXT: pextrw $6, %xmm4, %r12d -; SSSE3-NEXT: pextrw $6, %xmm0, %r13d -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: movl %r13d, %r15d -; SSSE3-NEXT: subw %r12w, %r15w -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: addl $32767, %eax # imm = 0x7FFF -; SSSE3-NEXT: subw %r12w, %r13w -; SSSE3-NEXT: cmovol %eax, %r13d -; SSSE3-NEXT: pextrw $7, %xmm4, %r15d -; SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSSE3-NEXT: xorl %r12d, %r12d -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: subw %r15w, %r14w -; SSSE3-NEXT: setns %r12b -; SSSE3-NEXT: addl $32767, %r12d # imm = 0x7FFF -; SSSE3-NEXT: subw %r15w, %ax -; SSSE3-NEXT: cmovol %r12d, %eax -; SSSE3-NEXT: movd %eax, %xmm10 -; SSSE3-NEXT: movd %r13d, %xmm12 -; SSSE3-NEXT: movd %ecx, %xmm8 -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: movd %r8d, %xmm13 -; SSSE3-NEXT: movd %r9d, %xmm5 -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: movd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSSE3-NEXT: movd %r10d, %xmm11 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSSE3-NEXT: movd %r11d, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; SSSE3-NEXT: movd %ebx, %xmm14 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSSE3-NEXT: movd %ebp, %xmm15 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSSE3-NEXT: movd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm15[0] -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: addq $8, %rsp -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v32i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrw $7, %xmm5, %eax -; SSE41-NEXT: pextrw $7, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $6, %xmm5, %eax -; SSE41-NEXT: pextrw $6, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $5, %xmm5, %eax -; SSE41-NEXT: pextrw $5, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: cmovol %ecx, %esi -; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: pextrw $4, %xmm5, %eax -; SSE41-NEXT: pextrw $4, %xmm1, %esi -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %esi, %edx -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; 
SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $3, %xmm5, %eax
-; SSE41-NEXT: pextrw $3, %xmm1, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $2, %xmm5, %eax
-; SSE41-NEXT: pextrw $2, %xmm1, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: movd %xmm5, %eax
-; SSE41-NEXT: movd %xmm1, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $1, %xmm5, %eax
-; SSE41-NEXT: pextrw $1, %xmm1, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $7, %xmm6, %eax
-; SSE41-NEXT: pextrw $7, %xmm2, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $6, %xmm6, %eax
-; SSE41-NEXT: pextrw $6, %xmm2, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $5, %xmm6, %eax
-; SSE41-NEXT: pextrw $5, %xmm2, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $4, %xmm6, %eax
-; SSE41-NEXT: pextrw $4, %xmm2, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $3, %xmm6, %eax
-; SSE41-NEXT: pextrw $3, %xmm2, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $2, %xmm6, %eax
-; SSE41-NEXT: pextrw $2, %xmm2, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: movd %xmm6, %eax
-; SSE41-NEXT: movd %xmm2, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $1, %xmm6, %eax
-; SSE41-NEXT: pextrw $1, %xmm2, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $7, %xmm7, %eax
-; SSE41-NEXT: pextrw $7, %xmm3, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $6, %xmm7, %eax
-; SSE41-NEXT: pextrw $6, %xmm3, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $5, %xmm7, %eax
-; SSE41-NEXT: pextrw $5, %xmm3, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $4, %xmm7, %eax
-; SSE41-NEXT: pextrw $4, %xmm3, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: pextrw $3, %xmm7, %eax
-; SSE41-NEXT: pextrw $3, %xmm3, %ebx
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %ebx, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %bx
-; SSE41-NEXT: cmovol %ecx, %ebx
-; SSE41-NEXT: pextrw $2, %xmm7, %eax
-; SSE41-NEXT: pextrw $2, %xmm3, %r11d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r11d, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r11w
-; SSE41-NEXT: cmovol %ecx, %r11d
-; SSE41-NEXT: movd %xmm7, %eax
-; SSE41-NEXT: movd %xmm3, %r10d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r10d, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r10w
-; SSE41-NEXT: cmovol %ecx, %r10d
-; SSE41-NEXT: pextrw $1, %xmm7, %eax
-; SSE41-NEXT: pextrw $1, %xmm3, %r9d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r9d, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r9w
-; SSE41-NEXT: cmovol %ecx, %r9d
-; SSE41-NEXT: pextrw $7, %xmm4, %eax
-; SSE41-NEXT: pextrw $7, %xmm0, %r8d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r8d, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r8w
-; SSE41-NEXT: cmovol %ecx, %r8d
-; SSE41-NEXT: pextrw $6, %xmm4, %eax
-; SSE41-NEXT: pextrw $6, %xmm0, %edi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %edi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %di
-; SSE41-NEXT: cmovol %ecx, %edi
-; SSE41-NEXT: pextrw $5, %xmm4, %eax
-; SSE41-NEXT: pextrw $5, %xmm0, %esi
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %esi, %edx
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: cmovol %ecx, %esi
-; SSE41-NEXT: pextrw $4, %xmm4, %eax
-; SSE41-NEXT: pextrw $4, %xmm0, %edx
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %edx, %r13d
-; SSE41-NEXT: subw %ax, %r13w
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %dx
-; SSE41-NEXT: cmovol %ecx, %edx
-; SSE41-NEXT: pextrw $3, %xmm4, %eax
-; SSE41-NEXT: pextrw $3, %xmm0, %r13d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r13d, %r12d
-; SSE41-NEXT: subw %ax, %r12w
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r13w
-; SSE41-NEXT: cmovol %ecx, %r13d
-; SSE41-NEXT: pextrw $2, %xmm4, %r12d
-; SSE41-NEXT: pextrw $2, %xmm0, %eax
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %eax, %r15d
-; SSE41-NEXT: subw %r12w, %r15w
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %r12w, %ax
-; SSE41-NEXT: cmovol %ecx, %eax
-; SSE41-NEXT: movd %xmm4, %r15d
-; SSE41-NEXT: movd %xmm0, %r12d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r12d, %r14d
-; SSE41-NEXT: subw %r15w, %r14w
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %r15w, %r12w
-; SSE41-NEXT: cmovol %ecx, %r12d
-; SSE41-NEXT: pextrw $1, %xmm4, %r14d
-; SSE41-NEXT: pextrw $1, %xmm0, %ecx
-; SSE41-NEXT: xorl %r15d, %r15d
-; SSE41-NEXT: movl %ecx, %ebp
-; SSE41-NEXT: subw %r14w, %bp
-; SSE41-NEXT: setns %r15b
-; SSE41-NEXT: addl $32767, %r15d # imm = 0x7FFF
-; SSE41-NEXT: subw %r14w, %cx
-; SSE41-NEXT: cmovol %r15d, %ecx
-; SSE41-NEXT: movd %r12d, %xmm0
-; SSE41-NEXT: pinsrw $1, %ecx, %xmm0
-; SSE41-NEXT: pinsrw $2, %eax, %xmm0
-; SSE41-NEXT: pinsrw $3, %r13d, %xmm0
-; SSE41-NEXT: pinsrw $4, %edx, %xmm0
-; SSE41-NEXT: pinsrw $5, %esi, %xmm0
-; SSE41-NEXT: pinsrw $6, %edi, %xmm0
-; SSE41-NEXT: pinsrw $7, %r8d, %xmm0
-; SSE41-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; SSE41-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; SSE41-NEXT: movd %r10d, %xmm3
-; SSE41-NEXT: pinsrw $1, %r9d, %xmm3
-; SSE41-NEXT: pinsrw $2, %r11d, %xmm3
-; SSE41-NEXT: pinsrw $3, %ebx, %xmm3
-; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v32i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psubsw %xmm4, %xmm0
+; SSE-NEXT: psubsw %xmm5, %xmm1
+; SSE-NEXT: psubsw %xmm6, %xmm2
+; SSE-NEXT: psubsw %xmm7, %xmm3
+; SSE-NEXT: retq
;
; AVX1-LABEL: v32i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vpextrw $7, %xmm3, %eax
-; AVX1-NEXT: vpextrw $7, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $6, %xmm3, %eax
-; AVX1-NEXT: vpextrw $6, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $5, %xmm3, %eax
-; AVX1-NEXT: vpextrw $5, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $4, %xmm3, %eax
-; AVX1-NEXT: vpextrw $4, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $3, %xmm3, %eax
-; AVX1-NEXT: vpextrw $3, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $2, %xmm3, %eax
-; AVX1-NEXT: vpextrw $2, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vmovd %xmm3, %eax
-; AVX1-NEXT: vmovd %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $1, %xmm3, %eax
-; AVX1-NEXT: vpextrw $1, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpextrw $7, %xmm3, %eax
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpextrw $7, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $6, %xmm3, %eax
-; AVX1-NEXT: vpextrw $6, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $5, %xmm3, %eax
-; AVX1-NEXT: vpextrw $5, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $4, %xmm3, %eax
-; AVX1-NEXT: vpextrw $4, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $3, %xmm3, %eax
-; AVX1-NEXT: vpextrw $3, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $2, %xmm3, %eax
-; AVX1-NEXT: vpextrw $2, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vmovd %xmm3, %eax
-; AVX1-NEXT: vmovd %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $1, %xmm3, %eax
-; AVX1-NEXT: vpextrw $1, %xmm1, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $7, %xmm2, %eax
-; AVX1-NEXT: vpextrw $7, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $6, %xmm2, %eax
-; AVX1-NEXT: vpextrw $6, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $5, %xmm2, %eax
-; AVX1-NEXT: vpextrw $5, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $4, %xmm2, %eax
-; AVX1-NEXT: vpextrw $4, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrw $3, %xmm2, %eax
-; AVX1-NEXT: vpextrw $3, %xmm0, %ebx
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %ebx, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %bx
-; AVX1-NEXT: cmovol %ecx, %ebx
-; AVX1-NEXT: vpextrw $2, %xmm2, %eax
-; AVX1-NEXT: vpextrw $2, %xmm0, %r11d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r11d, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r11w
-; AVX1-NEXT: cmovol %ecx, %r11d
-; AVX1-NEXT: vmovd %xmm2, %eax
-; AVX1-NEXT: vmovd %xmm0, %r10d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r10d, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r10w
-; AVX1-NEXT: cmovol %ecx, %r10d
-; AVX1-NEXT: vpextrw $1, %xmm2, %eax
-; AVX1-NEXT: vpextrw $1, %xmm0, %r9d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r9d, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r9w
-; AVX1-NEXT: cmovol %ecx, %r9d
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpextrw $7, %xmm1, %eax
-; AVX1-NEXT: vpextrw $7, %xmm0, %r8d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r8d, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r8w
-; AVX1-NEXT: cmovol %ecx, %r8d
-; AVX1-NEXT: vpextrw $6, %xmm1, %eax
-; AVX1-NEXT: vpextrw $6, %xmm0, %edi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %edi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %di
-; AVX1-NEXT: cmovol %ecx, %edi
-; AVX1-NEXT: vpextrw $5, %xmm1, %eax
-; AVX1-NEXT: vpextrw $5, %xmm0, %esi
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %esi, %edx
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: cmovol %ecx, %esi
-; AVX1-NEXT: vpextrw $4, %xmm1, %eax
-; AVX1-NEXT: vpextrw $4, %xmm0, %edx
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %edx, %r13d
-; AVX1-NEXT: subw %ax, %r13w
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %dx
-; AVX1-NEXT: cmovol %ecx, %edx
-; AVX1-NEXT: vpextrw $3, %xmm1, %eax
-; AVX1-NEXT: vpextrw $3, %xmm0, %r13d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r13d, %r12d
-; AVX1-NEXT: subw %ax, %r12w
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r13w
-; AVX1-NEXT: cmovol %ecx, %r13d
-; AVX1-NEXT: vpextrw $2, %xmm1, %r12d
-; AVX1-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %eax, %r15d
-; AVX1-NEXT: subw %r12w, %r15w
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %r12w, %ax
-; AVX1-NEXT: cmovol %ecx, %eax
-; AVX1-NEXT: vmovd %xmm1, %r15d
-; AVX1-NEXT: vmovd %xmm0, %r12d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r12d, %r14d
-; AVX1-NEXT: subw %r15w, %r14w
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %r15w, %r12w
-; AVX1-NEXT: cmovol %ecx, %r12d
-; AVX1-NEXT: vpextrw $1, %xmm1, %r14d
-; AVX1-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX1-NEXT: xorl %r15d, %r15d
-; AVX1-NEXT: movl %ecx, %ebp
-; AVX1-NEXT: subw %r14w, %bp
-; AVX1-NEXT: setns %r15b
-; AVX1-NEXT: addl $32767, %r15d # imm = 0x7FFF
-; AVX1-NEXT: subw %r14w, %cx
-; AVX1-NEXT: cmovol %r15d, %ecx
-; AVX1-NEXT: vmovd %r12d, %xmm0
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %r10d, %xmm1
-; AVX1-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX1-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX1-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: # xmm3 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpsubsw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsubsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpsubsw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsubsw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v32i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vpextrw $7, %xmm3, %eax
-; AVX2-NEXT: vpextrw $7, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $6, %xmm3, %eax
-; AVX2-NEXT: vpextrw $6, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $5, %xmm3, %eax
-; AVX2-NEXT: vpextrw $5, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $4, %xmm3, %eax
-; AVX2-NEXT: vpextrw $4, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $3, %xmm3, %eax
-; AVX2-NEXT: vpextrw $3, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $2, %xmm3, %eax
-; AVX2-NEXT: vpextrw $2, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vmovd %xmm3, %eax
-; AVX2-NEXT: vmovd %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $1, %xmm3, %eax
-; AVX2-NEXT: vpextrw $1, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT: vpextrw $7, %xmm3, %eax
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpextrw $7, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $6, %xmm3, %eax
-; AVX2-NEXT: vpextrw $6, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $5, %xmm3, %eax
-; AVX2-NEXT: vpextrw $5, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $4, %xmm3, %eax
-; AVX2-NEXT: vpextrw $4, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $3, %xmm3, %eax
-; AVX2-NEXT: vpextrw $3, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $2, %xmm3, %eax
-; AVX2-NEXT: vpextrw $2, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vmovd %xmm3, %eax
-; AVX2-NEXT: vmovd %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $1, %xmm3, %eax
-; AVX2-NEXT: vpextrw $1, %xmm1, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $7, %xmm2, %eax
-; AVX2-NEXT: vpextrw $7, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $6, %xmm2, %eax
-; AVX2-NEXT: vpextrw $6, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $5, %xmm2, %eax
-; AVX2-NEXT: vpextrw $5, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $4, %xmm2, %eax
-; AVX2-NEXT: vpextrw $4, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrw $3, %xmm2, %eax
-; AVX2-NEXT: vpextrw $3, %xmm0, %ebx
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %ebx, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %bx
-; AVX2-NEXT: cmovol %ecx, %ebx
-; AVX2-NEXT: vpextrw $2, %xmm2, %eax
-; AVX2-NEXT: vpextrw $2, %xmm0, %r11d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r11d, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %r11w
-; AVX2-NEXT: cmovol %ecx, %r11d
-; AVX2-NEXT: vmovd %xmm2, %eax
-; AVX2-NEXT: vmovd %xmm0, %r10d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r10d, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %r10w
-; AVX2-NEXT: cmovol %ecx, %r10d
-; AVX2-NEXT: vpextrw $1, %xmm2, %eax
-; AVX2-NEXT: vpextrw $1, %xmm0, %r9d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r9d, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %r9w
-; AVX2-NEXT: cmovol %ecx, %r9d
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-NEXT: vpextrw $7, %xmm0, %r8d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r8d, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %r8w
-; AVX2-NEXT: cmovol %ecx, %r8d
-; AVX2-NEXT: vpextrw $6, %xmm1, %eax
-; AVX2-NEXT: vpextrw $6, %xmm0, %edi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %edi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %di
-; AVX2-NEXT: cmovol %ecx, %edi
-; AVX2-NEXT: vpextrw $5, %xmm1, %eax
-; AVX2-NEXT: vpextrw $5, %xmm0, %esi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %esi, %edx
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %si
-; AVX2-NEXT: cmovol %ecx, %esi
-; AVX2-NEXT: vpextrw $4, %xmm1, %eax
-; AVX2-NEXT: vpextrw $4, %xmm0, %edx
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %edx, %r13d
-; AVX2-NEXT: subw %ax, %r13w
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %dx
-; AVX2-NEXT: cmovol %ecx, %edx
-; AVX2-NEXT: vpextrw $3, %xmm1, %eax
-; AVX2-NEXT: vpextrw $3, %xmm0, %r13d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r13d, %r12d
-; AVX2-NEXT: subw %ax, %r12w
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %ax, %r13w
-; AVX2-NEXT: cmovol %ecx, %r13d
-; AVX2-NEXT: vpextrw $2, %xmm1, %r12d
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %eax, %r15d
-; AVX2-NEXT: subw %r12w, %r15w
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %r12w, %ax
-; AVX2-NEXT: cmovol %ecx, %eax
-; AVX2-NEXT: vmovd %xmm1, %r15d
-; AVX2-NEXT: vmovd %xmm0, %r12d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: movl %r12d, %r14d
-; AVX2-NEXT: subw %r15w, %r14w
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX2-NEXT: subw %r15w, %r12w
-; AVX2-NEXT: cmovol %ecx, %r12d
-; AVX2-NEXT: vpextrw $1, %xmm1, %r14d
-; AVX2-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: movl %ecx, %ebp
-; AVX2-NEXT: subw %r14w, %bp
-; AVX2-NEXT: setns %r15b
-; AVX2-NEXT: addl $32767, %r15d # imm = 0x7FFF
-; AVX2-NEXT: subw %r14w, %cx
-; AVX2-NEXT: cmovol %r15d, %ecx
-; AVX2-NEXT: vmovd %r12d, %xmm0
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %r10d, %xmm1
-; AVX2-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX2-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: # xmm2 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX2-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: # xmm3 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpsubsw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v32i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: vpextrw $7, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: vpextrw $6, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: vpextrw $5, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: vpextrw $4, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: vpextrw $3, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: vpextrw $2, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: vmovd %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: vpextrw $1, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpextrw $7, %xmm2, %eax
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpextrw $7, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $6, %xmm2, %eax
-; AVX512-NEXT: vpextrw $6, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $5, %xmm2, %eax
-; AVX512-NEXT: vpextrw $5, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512-NEXT: vpextrw $4, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $3, %xmm2, %eax
-; AVX512-NEXT: vpextrw $3, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $2, %xmm2, %eax
-; AVX512-NEXT: vpextrw $2, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vmovd %xmm2, %eax
-; AVX512-NEXT: vmovd %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $1, %xmm2, %eax
-; AVX512-NEXT: vpextrw $1, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512-NEXT: vpextrw $7, %xmm2, %eax
-; AVX512-NEXT: vpextrw $7, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $6, %xmm2, %eax
-; AVX512-NEXT: vpextrw $6, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $5, %xmm2, %eax
-; AVX512-NEXT: vpextrw $5, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512-NEXT: vpextrw $4, %xmm3, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrw $3, %xmm2, %eax
-; AVX512-NEXT: vpextrw $3, %xmm3, %ebx
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %ebx, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %bx
-; AVX512-NEXT: cmovol %ecx, %ebx
-; AVX512-NEXT: vpextrw $2, %xmm2, %eax
-; AVX512-NEXT: vpextrw $2, %xmm3, %r11d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r11d, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %r11w
-; AVX512-NEXT: cmovol %ecx, %r11d
-; AVX512-NEXT: vmovd %xmm2, %eax
-; AVX512-NEXT: vmovd %xmm3, %r10d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r10d, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %r10w
-; AVX512-NEXT: cmovol %ecx, %r10d
-; AVX512-NEXT: vpextrw $1, %xmm2, %eax
-; AVX512-NEXT: vpextrw $1, %xmm3, %r9d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r9d, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %r9w
-; AVX512-NEXT: cmovol %ecx, %r9d
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: vpextrw $7, %xmm0, %r8d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r8d, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %r8w
-; AVX512-NEXT: cmovol %ecx, %r8d
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: vpextrw $6, %xmm0, %edi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %edi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %di
-; AVX512-NEXT: cmovol %ecx, %edi
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: vpextrw $5, %xmm0, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %esi, %edx
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %si
-; AVX512-NEXT: cmovol %ecx, %esi
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: vpextrw $4, %xmm0, %edx
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %edx, %r13d
-; AVX512-NEXT: subw %ax, %r13w
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %dx
-; AVX512-NEXT: cmovol %ecx, %edx
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: vpextrw $3, %xmm0, %r13d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r13d, %r12d
-; AVX512-NEXT: subw %ax, %r12w
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %ax, %r13w
-; AVX512-NEXT: cmovol %ecx, %r13d
-; AVX512-NEXT: vpextrw $2, %xmm1, %r12d
-; AVX512-NEXT: vpextrw $2, %xmm0, %eax
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %eax, %r15d
-; AVX512-NEXT: subw %r12w, %r15w
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %r12w, %ax
-; AVX512-NEXT: cmovol %ecx, %eax
-; AVX512-NEXT: vmovd %xmm1, %r15d
-; AVX512-NEXT: vmovd %xmm0, %r12d
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: movl %r12d, %r14d
-; AVX512-NEXT: subw %r15w, %r14w
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX512-NEXT: subw %r15w, %r12w
-; AVX512-NEXT: cmovol %ecx, %r12d
-; AVX512-NEXT: vpextrw $1, %xmm1, %r14d
-; AVX512-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX512-NEXT: xorl %r15d, %r15d
-; AVX512-NEXT: movl %ecx, %ebp
-; AVX512-NEXT: subw %r14w, %bp
-; AVX512-NEXT: setns %r15b
-; AVX512-NEXT: addl $32767, %r15d # imm = 0x7FFF
-; AVX512-NEXT: subw %r14w, %cx
-; AVX512-NEXT: cmovol %r15d, %ecx
-; AVX512-NEXT: vmovd %r12d, %xmm0
-; AVX512-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %r10d, %xmm1
-; AVX512-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX512-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: # xmm2 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload
-; AVX512-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: # xmm3 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpsubsw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
  ret <32 x i16> %z
@@ -13489,940 +196,76 @@
; Too narrow vectors, legalized by widening.
define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
-; SSE2-LABEL: v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: movd %xmm0, %r8d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r8d, %esi
-; SSE2-NEXT: subw %ax, %si
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %r8w
-; SSE2-NEXT: cmovol %ecx, %r8d
-; SSE2-NEXT: pextrw $1, %xmm1, %eax
-; SSE2-NEXT: pextrw $1, %xmm0, %r9d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r9d, %esi
-; SSE2-NEXT: subw %ax, %si
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %r9w
-; SSE2-NEXT: cmovol %ecx, %r9d
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
-; SSE2-NEXT: pextrw $2, %xmm0, %r10d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r10d, %esi
-; SSE2-NEXT: subw %ax, %si
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %r10w
-; SSE2-NEXT: cmovol %ecx, %r10d
-; SSE2-NEXT: pextrw $3, %xmm1, %eax
-; SSE2-NEXT: pextrw $3, %xmm0, %r11d
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: movl %r11d, %esi
-; SSE2-NEXT: subw %ax, %si
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE2-NEXT: subw %ax, %r11w
-; SSE2-NEXT: cmovol %ecx, %r11d
-; SSE2-NEXT: pextrw $4, %xmm1, %ecx
-; SSE2-NEXT: pextrw $4, %xmm0, %r14d
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: movl %r14d, %edi
-; SSE2-NEXT: subw %cx, %di
-; SSE2-NEXT: setns %sil
-; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF
-; SSE2-NEXT: subw %cx, %r14w
-; SSE2-NEXT: cmovol %esi, %r14d
-; SSE2-NEXT: pextrw $5, %xmm1, %esi
-; SSE2-NEXT: pextrw $5, %xmm0, %ecx
-; SSE2-NEXT: xorl %edi, %edi
-; SSE2-NEXT: movl %ecx, %ebx
-; SSE2-NEXT: subw %si, %bx
-; SSE2-NEXT: setns %dil
-; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF
-; SSE2-NEXT: subw %si, %cx
-; SSE2-NEXT: cmovol %edi, %ecx
-; SSE2-NEXT: pextrw $6, %xmm1, %edi
-; SSE2-NEXT: pextrw $6, %xmm0, %esi
-; SSE2-NEXT: xorl %ebx, %ebx
-; SSE2-NEXT: movl %esi, %ebp
-; SSE2-NEXT: subw %di, %bp
-; SSE2-NEXT: setns %bl
-; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF
-; SSE2-NEXT: subw %di, %si
-; SSE2-NEXT: cmovol %ebx, %esi
-; SSE2-NEXT: pextrw $7, %xmm1, %edi
-; SSE2-NEXT: pextrw $7, %xmm0, %ebx
-; SSE2-NEXT: xorl %ebp, %ebp
-; SSE2-NEXT: movl %ebx, %eax
-; SSE2-NEXT: subw %di, %ax
-; SSE2-NEXT: setns %bpl
-; SSE2-NEXT: addl $32767, %ebp # imm = 0x7FFF
-; SSE2-NEXT: subw %di, %bx
-; SSE2-NEXT: cmovol %ebp, %ebx
-; SSE2-NEXT: movd %ebx, %xmm0
-; SSE2-NEXT: movd %esi, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movd %r14d, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movd %r11d, %xmm0
-; SSE2-NEXT: movd %r10d, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %r9d, %xmm0
-; SSE2-NEXT: movd %r8d, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: packuswb %xmm0, %xmm3
-; SSE2-NEXT: movq %xmm3, (%rdx)
-; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: popq %r14
-; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v8i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pushq %rbp
-; SSSE3-NEXT: pushq %r14
-; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: movd %xmm1, %eax
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: movd %xmm0, %r8d
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: movl %r8d, %esi
-; SSSE3-NEXT: subw %ax, %si
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSSE3-NEXT: subw %ax, %r8w
-; SSSE3-NEXT: cmovol %ecx, %r8d
-; SSSE3-NEXT: pextrw $1, %xmm1, %eax
-; SSSE3-NEXT: pextrw $1, %xmm0, %r9d
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: movl %r9d, %esi
-; SSSE3-NEXT: subw %ax, %si
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSSE3-NEXT: subw %ax, %r9w
-; SSSE3-NEXT: cmovol %ecx, %r9d
-; SSSE3-NEXT: pextrw $2, %xmm1, %eax
-; SSSE3-NEXT: pextrw $2, %xmm0, %r10d
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: movl %r10d, %esi
-; SSSE3-NEXT: subw %ax, %si
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSSE3-NEXT: subw %ax, %r10w
-; SSSE3-NEXT: cmovol %ecx, %r10d
-; SSSE3-NEXT: pextrw $3, %xmm1, %eax
-; SSSE3-NEXT: pextrw $3, %xmm0, %r11d
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: movl %r11d, %esi
-; SSSE3-NEXT: subw %ax, %si
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSSE3-NEXT: subw %ax, %r11w
-; SSSE3-NEXT: cmovol %ecx, %r11d
-; SSSE3-NEXT: pextrw $4, %xmm1, %ecx
-; SSSE3-NEXT: pextrw $4, %xmm0, %r14d
-; SSSE3-NEXT: xorl %esi, %esi
-; SSSE3-NEXT: movl %r14d, %edi
-; SSSE3-NEXT: subw %cx, %di
-; SSSE3-NEXT: setns %sil
-; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF
-; SSSE3-NEXT: subw %cx, %r14w
-; SSSE3-NEXT: cmovol %esi, %r14d
-; SSSE3-NEXT: pextrw $5, %xmm1, %esi
-; SSSE3-NEXT: pextrw $5, %xmm0, %ecx
-; SSSE3-NEXT: xorl %edi, %edi
-; SSSE3-NEXT: movl %ecx, %ebx
-; SSSE3-NEXT: subw %si, %bx
-; SSSE3-NEXT: setns %dil
-; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF
-; SSSE3-NEXT: subw %si, %cx
-; SSSE3-NEXT: cmovol %edi, %ecx
-; SSSE3-NEXT: pextrw $6, %xmm1, %edi
-; SSSE3-NEXT: pextrw $6, %xmm0, %esi
-; SSSE3-NEXT: xorl %ebx, %ebx
-; SSSE3-NEXT: movl %esi, %ebp
-; SSSE3-NEXT: subw %di, %bp
-; SSSE3-NEXT: setns %bl
-; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF
-; SSSE3-NEXT: subw %di, %si
-; SSSE3-NEXT: cmovol %ebx, %esi
-; SSSE3-NEXT: pextrw $7, %xmm1, %edi
-; SSSE3-NEXT: pextrw $7, %xmm0, %ebx
-; SSSE3-NEXT: xorl %ebp, %ebp
-; SSSE3-NEXT: movl %ebx, %eax
-; SSSE3-NEXT: subw %di, %ax
-; SSSE3-NEXT: setns %bpl
-; SSSE3-NEXT: addl $32767, %ebp # imm = 0x7FFF
-; SSSE3-NEXT: subw %di, %bx
-; SSSE3-NEXT: cmovol %ebp, %ebx
-; SSSE3-NEXT: movd %ebx, %xmm0
-; SSSE3-NEXT: movd %esi, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movd %r14d, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: movd %r11d, %xmm0
-; SSSE3-NEXT: movd %r10d, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movd %r9d, %xmm0
-; SSSE3-NEXT: movd %r8d, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSSE3-NEXT: psrlw $8, %xmm3
-; SSSE3-NEXT: packuswb %xmm0, %xmm3
-; SSSE3-NEXT: movq %xmm3, (%rdx)
-; SSSE3-NEXT: popq %rbx
-; SSSE3-NEXT: popq %r14
-; SSSE3-NEXT: popq %rbp
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pushq %rbp
-; SSE41-NEXT: pushq %r14
-; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE41-NEXT: pextrw $7, %xmm1, %eax
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE41-NEXT: pextrw $7, %xmm0, %r8d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r8d, %esi
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r8w
-; SSE41-NEXT: cmovol %ecx, %r8d
-; SSE41-NEXT: pextrw $6, %xmm1, %eax
-; SSE41-NEXT: pextrw $6, %xmm0, %r9d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r9d, %esi
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r9w
-; SSE41-NEXT: cmovol %ecx, %r9d
-; SSE41-NEXT: pextrw $5, %xmm1, %eax
-; SSE41-NEXT: pextrw $5, %xmm0, %r10d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r10d, %edi
-; SSE41-NEXT: subw %ax, %di
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r10w
-; SSE41-NEXT: cmovol %ecx, %r10d
-; SSE41-NEXT: pextrw $4, %xmm1, %eax
-; SSE41-NEXT: pextrw $4, %xmm0, %r11d
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: movl %r11d, %esi
-; SSE41-NEXT: subw %ax, %si
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSE41-NEXT: subw %ax, %r11w
-; SSE41-NEXT: cmovol %ecx, %r11d
-; SSE41-NEXT: pextrw $3, %xmm1, %ecx
-; SSE41-NEXT: pextrw $3, %xmm0, %r14d
-; SSE41-NEXT: xorl %esi, %esi
-; SSE41-NEXT: movl %r14d, %edi
-; SSE41-NEXT: subw %cx, %di
-; SSE41-NEXT: setns %sil
-; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF
-; SSE41-NEXT: subw %cx, %r14w
-; SSE41-NEXT: cmovol %esi, %r14d
-; SSE41-NEXT: pextrw $2, %xmm1, %esi
-; SSE41-NEXT: pextrw $2, %xmm0, %ecx
-; SSE41-NEXT: xorl %edi, %edi
-; SSE41-NEXT: movl %ecx, %ebx
-; SSE41-NEXT: subw %si, %bx
-; SSE41-NEXT: setns %dil
-; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF
-; SSE41-NEXT: subw %si, %cx
-; SSE41-NEXT: cmovol %edi, %ecx
-; SSE41-NEXT: movd %xmm1, %esi
-; SSE41-NEXT: movd %xmm0, %edi
-; SSE41-NEXT: xorl %ebx, %ebx
-; SSE41-NEXT: movl %edi, %ebp
-; SSE41-NEXT: subw %si, %bp
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF
-; SSE41-NEXT: subw %si, %di
-; SSE41-NEXT: cmovol %ebx, %edi
-; SSE41-NEXT: pextrw $1, %xmm1, %esi
-; SSE41-NEXT: pextrw $1, %xmm0, %ebx
-; SSE41-NEXT: xorl %ebp, %ebp
-; SSE41-NEXT: movl %ebx, %eax
-; SSE41-NEXT: subw %si, %ax
-; SSE41-NEXT: setns %bpl
-; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF
-; SSE41-NEXT: subw %si, %bx
-; SSE41-NEXT: cmovol %ebp, %ebx
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pinsrw $1, %ebx, %xmm0
-; SSE41-NEXT: pinsrw $2, %ecx, %xmm0
-; SSE41-NEXT: pinsrw $3, %r14d, %xmm0
-; SSE41-NEXT: pinsrw $4, %r11d, %xmm0
-; SSE41-NEXT: pinsrw $5, %r10d, %xmm0
-; SSE41-NEXT: pinsrw $6, %r9d, %xmm0
-; SSE41-NEXT: pinsrw $7, %r8d, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm0
-; SSE41-NEXT: movq %xmm0, (%rdx)
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v8i8:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: psubsb %xmm1, %xmm0
+; SSE-NEXT: movq %xmm0, (%rdx)
+; SSE-NEXT: retq
;
; AVX1-LABEL: v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX1-NEXT: vpextrw $7, %xmm0, %eax
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX1-NEXT: vpextrw $7, %xmm1, %r8d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r8d, %esi
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r8w
-; AVX1-NEXT: cmovol %ecx, %r8d
-; AVX1-NEXT: vpextrw $6, %xmm0, %eax
-; AVX1-NEXT: vpextrw $6, %xmm1, %r9d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r9d, %esi
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r9w
-; AVX1-NEXT: cmovol %ecx, %r9d
-; AVX1-NEXT: vpextrw $5, %xmm0, %eax
-; AVX1-NEXT: vpextrw $5, %xmm1, %r10d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r10d, %edi
-; AVX1-NEXT: subw %ax, %di
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r10w
-; AVX1-NEXT: cmovol %ecx, %r10d
-; AVX1-NEXT: vpextrw $4, %xmm0, %eax
-; AVX1-NEXT: vpextrw $4, %xmm1, %r11d
-; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: movl %r11d, %esi
-; AVX1-NEXT: subw %ax, %si
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; AVX1-NEXT: subw %ax, %r11w
-; AVX1-NEXT: cmovol %ecx, %r11d
-; AVX1-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX1-NEXT: vpextrw $3, %xmm1, %r14d
-; AVX1-NEXT: xorl %esi, %esi
-; AVX1-NEXT: movl %r14d, %edi
-; AVX1-NEXT: subw %cx, %di
-; AVX1-NEXT: setns %sil
-; AVX1-NEXT: addl $32767, %esi # imm = 0x7FFF
-; AVX1-NEXT: subw %cx, %r14w
-; AVX1-NEXT: cmovol %esi, %r14d
-; AVX1-NEXT: vpextrw $2, %xmm0, %esi
-; AVX1-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX1-NEXT: xorl %edi, %edi
-; AVX1-NEXT: movl %ecx, %ebx
-; AVX1-NEXT: subw %si, %bx
-; AVX1-NEXT: setns %dil
-; AVX1-NEXT: addl $32767, %edi # imm = 0x7FFF
-; AVX1-NEXT: subw %si, %cx
-; AVX1-NEXT: cmovol %edi, %ecx
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: vmovd %xmm1, %edi
-; AVX1-NEXT: xorl %ebx, %ebx
-; AVX1-NEXT: movl %edi, %ebp
-; AVX1-NEXT: subw %si, %bp
-; AVX1-NEXT: setns %bl
-; AVX1-NEXT: addl $32767, %ebx # imm = 0x7FFF
-; AVX1-NEXT: subw %si, %di
-; AVX1-NEXT: cmovol %ebx, %edi
-; AVX1-NEXT: vpextrw $1, %xmm0, %esi
-; AVX1-NEXT: vpextrw $1, %xmm1, %ebx
-; AVX1-NEXT: xorl %ebp, %ebp
-; AVX1-NEXT: movl %ebx, %eax
-; AVX1-NEXT: subw %si, %ax
-; AVX1-NEXT: setns %bpl
-; AVX1-NEXT: addl $32767, %ebp # imm = 0x7FFF
-; AVX1-NEXT: subw %si, %bx
-; AVX1-NEXT: cmovol %ebp, %ebx
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdx)
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX2-NEXT: vpextrw $7, %xmm0,
%eax -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-NEXT: vpextrw $7, %xmm1, %r8d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r8d, %esi -; AVX2-NEXT: subw %ax, %si -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: subw %ax, %r8w -; AVX2-NEXT: cmovol %ecx, %r8d -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: vpextrw $6, %xmm1, %r9d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r9d, %esi -; AVX2-NEXT: subw %ax, %si -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: subw %ax, %r9w -; AVX2-NEXT: cmovol %ecx, %r9d -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: vpextrw $5, %xmm1, %r10d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r10d, %edi -; AVX2-NEXT: subw %ax, %di -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: subw %ax, %r10w -; AVX2-NEXT: cmovol %ecx, %r10d -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: vpextrw $4, %xmm1, %r11d -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %r11d, %esi -; AVX2-NEXT: subw %ax, %si -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX2-NEXT: subw %ax, %r11w -; AVX2-NEXT: cmovol %ecx, %r11d -; AVX2-NEXT: vpextrw $3, %xmm0, %ecx -; AVX2-NEXT: vpextrw $3, %xmm1, %r14d -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: movl %r14d, %edi -; AVX2-NEXT: subw %cx, %di -; AVX2-NEXT: setns %sil -; AVX2-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX2-NEXT: subw %cx, %r14w -; AVX2-NEXT: cmovol %esi, %r14d -; AVX2-NEXT: vpextrw $2, %xmm0, %esi -; AVX2-NEXT: vpextrw $2, %xmm1, %ecx -; AVX2-NEXT: xorl %edi, %edi -; AVX2-NEXT: movl %ecx, %ebx -; AVX2-NEXT: subw %si, %bx -; AVX2-NEXT: setns %dil -; AVX2-NEXT: addl $32767, %edi # imm = 0x7FFF -; AVX2-NEXT: subw %si, %cx -; AVX2-NEXT: cmovol %edi, %ecx -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: vmovd %xmm1, %edi -; AVX2-NEXT: xorl %ebx, %ebx -; AVX2-NEXT: movl %edi, %ebp -; AVX2-NEXT: subw %si, %bp -; AVX2-NEXT: setns %bl -; AVX2-NEXT: addl $32767, %ebx # imm = 0x7FFF -; AVX2-NEXT: subw %si, %di -; AVX2-NEXT: cmovol %ebx, %edi -; AVX2-NEXT: vpextrw $1, %xmm0, %esi -; AVX2-NEXT: vpextrw $1, %xmm1, %ebx -; AVX2-NEXT: xorl %ebp, %ebp -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: subw %si, %ax -; AVX2-NEXT: setns %bpl -; AVX2-NEXT: addl $32767, %ebp # imm = 0x7FFF -; AVX2-NEXT: subw %si, %bx -; AVX2-NEXT: cmovol %ebp, %ebx -; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdx) -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: retq -; -; AVX512-LABEL: v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-NEXT: vpextrw $7, %xmm0, %eax -; 
AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vpextrw $7, %xmm1, %r8d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r8d, %esi -; AVX512-NEXT: subw %ax, %si -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: subw %ax, %r8w -; AVX512-NEXT: cmovol %ecx, %r8d -; AVX512-NEXT: vpextrw $6, %xmm0, %eax -; AVX512-NEXT: vpextrw $6, %xmm1, %r9d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r9d, %esi -; AVX512-NEXT: subw %ax, %si -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: subw %ax, %r9w -; AVX512-NEXT: cmovol %ecx, %r9d -; AVX512-NEXT: vpextrw $5, %xmm0, %eax -; AVX512-NEXT: vpextrw $5, %xmm1, %r10d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r10d, %edi -; AVX512-NEXT: subw %ax, %di -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: subw %ax, %r10w -; AVX512-NEXT: cmovol %ecx, %r10d -; AVX512-NEXT: vpextrw $4, %xmm0, %eax -; AVX512-NEXT: vpextrw $4, %xmm1, %r11d -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %r11d, %esi -; AVX512-NEXT: subw %ax, %si -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX512-NEXT: subw %ax, %r11w -; AVX512-NEXT: cmovol %ecx, %r11d -; AVX512-NEXT: vpextrw $3, %xmm0, %ecx -; AVX512-NEXT: vpextrw $3, %xmm1, %r14d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: movl %r14d, %edi -; AVX512-NEXT: subw %cx, %di -; AVX512-NEXT: setns %sil -; AVX512-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX512-NEXT: subw %cx, %r14w -; AVX512-NEXT: cmovol %esi, %r14d -; AVX512-NEXT: vpextrw $2, %xmm0, %esi -; AVX512-NEXT: vpextrw $2, %xmm1, %ecx -; AVX512-NEXT: xorl %edi, %edi -; AVX512-NEXT: movl %ecx, %ebx -; AVX512-NEXT: subw %si, %bx -; AVX512-NEXT: setns %dil -; AVX512-NEXT: addl $32767, %edi # imm = 0x7FFF -; AVX512-NEXT: subw %si, %cx -; AVX512-NEXT: cmovol %edi, %ecx -; AVX512-NEXT: vmovd %xmm0, %esi -; AVX512-NEXT: vmovd %xmm1, %edi -; AVX512-NEXT: xorl %ebx, %ebx -; AVX512-NEXT: movl %edi, %ebp -; AVX512-NEXT: subw %si, %bp -; AVX512-NEXT: setns %bl -; AVX512-NEXT: addl $32767, %ebx # imm = 0x7FFF -; AVX512-NEXT: subw %si, %di -; AVX512-NEXT: cmovol %ebx, %edi -; AVX512-NEXT: vpextrw $1, %xmm0, %esi -; AVX512-NEXT: vpextrw $1, %xmm1, %ebx -; AVX512-NEXT: xorl %ebp, %ebp -; AVX512-NEXT: movl %ebx, %eax -; AVX512-NEXT: subw %si, %ax -; AVX512-NEXT: setns %bpl -; AVX512-NEXT: addl $32767, %ebp # imm = 0x7FFF -; AVX512-NEXT: subw %si, %bx -; AVX512-NEXT: cmovol %ebp, %ebx -; AVX512-NEXT: vmovd %edi, %xmm0 -; AVX512-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpmovwb %xmm0, (%rdx) -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq - %x = load <8 x i8>, <8 x i8>* %px - %y = load <8 x i8>, <8 x i8>* %py - %z = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %x, <8 x i8> %y) - store <8 x i8> %z, <8 x i8>* %pz - ret void -} - -define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { -; SSE2-LABEL: v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pslld $24, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm2, %r8d -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %r8d, %edi -; SSE2-NEXT: subl %ecx, %edi -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE2-NEXT: subl %ecx, %r8d -; SSE2-NEXT: cmovol %esi, %r8d -; SSE2-NEXT: movd %xmm1, %esi -; SSE2-NEXT: movd %xmm0, %r10d -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %r10d, %ecx -; SSE2-NEXT: subl %esi, %ecx -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE2-NEXT: subl %esi, %r10d -; SSE2-NEXT: cmovol %edi, %r10d -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %r9d -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %ecx, %esi -; SSE2-NEXT: subl %r9d, %esi -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE2-NEXT: subl %r9d, %ecx -; SSE2-NEXT: cmovol %edi, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %r9d -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: subl %r9d, %edi -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE2-NEXT: subl %r9d, %eax -; SSE2-NEXT: cmovol %esi, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %r10d, %xmm0 -; SSE2-NEXT: movd %r8d, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrld $24, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v4i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSSE3-NEXT: movd %xmm2, %r8d -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %r8d, %edi -; SSSE3-NEXT: subl %ecx, %edi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSSE3-NEXT: subl %ecx, %r8d -; SSSE3-NEXT: cmovol %esi, %r8d -; SSSE3-NEXT: movd %xmm1, %esi -; SSSE3-NEXT: movd %xmm0, %r10d -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %r10d, %ecx -; SSSE3-NEXT: subl %esi, %ecx -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSSE3-NEXT: subl %esi, %r10d -; SSSE3-NEXT: cmovol %edi, %r10d -; 
SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %r9d -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: subl %r9d, %esi -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSSE3-NEXT: subl %r9d, %ecx -; SSSE3-NEXT: cmovol %edi, %ecx -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSSE3-NEXT: movd %xmm1, %r9d -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: subl %r9d, %edi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSSE3-NEXT: subl %r9d, %eax -; SSSE3-NEXT: cmovol %esi, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %r10d, %xmm0 -; SSSE3-NEXT: movd %r8d, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm0, (%rdx) -; SSSE3-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: retq ; -; SSE41-LABEL: v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pslld $24, %xmm1 -; SSE41-NEXT: pextrd $3, %xmm1, %ecx -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: pextrd $3, %xmm0, %r8d -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %r8d, %edi -; SSE41-NEXT: subl %ecx, %edi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE41-NEXT: subl %ecx, %r8d -; SSE41-NEXT: cmovol %esi, %r8d -; SSE41-NEXT: pextrd $2, %xmm1, %esi -; SSE41-NEXT: pextrd $2, %xmm0, %r10d -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %r10d, %ecx -; SSE41-NEXT: subl %esi, %ecx -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE41-NEXT: subl %esi, %r10d -; SSE41-NEXT: cmovol %edi, %r10d -; SSE41-NEXT: movd %xmm1, %r9d -; SSE41-NEXT: movd %xmm0, %ecx -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %ecx, %esi -; SSE41-NEXT: subl %r9d, %esi -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE41-NEXT: subl %r9d, %ecx -; SSE41-NEXT: cmovol %edi, %ecx -; SSE41-NEXT: pextrd $1, %xmm1, %r9d -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: subl %r9d, %edi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE41-NEXT: subl %r9d, %eax -; SSE41-NEXT: cmovol %esi, %eax -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: pinsrd $2, %r10d, %xmm0 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movd %xmm0, (%rdx) -; SSE41-NEXT: retq +; AVX512-LABEL: v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovwb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <8 x i8>, <8 x i8>* %px + %y = load <8 x i8>, <8 x i8>* %py + %z = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %x, <8 x i8> %y) + store <8 x i8> %z, <8 x i8>* %pz + ret void +} + +define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { +; SSE-LABEL: v4i8: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: psubsb %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %ecx -; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpextrd $3, %xmm1, %r9d -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: movl %r9d, %edi -; AVX1-NEXT: subl %ecx, %edi -; AVX1-NEXT: setns %sil -; AVX1-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX1-NEXT: subl %ecx, %r9d -; AVX1-NEXT: cmovol %esi, %r9d -; AVX1-NEXT: vpextrd $2, %xmm0, %r8d -; AVX1-NEXT: vpextrd $2, %xmm1, %r10d -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: movl %r10d, %ecx -; AVX1-NEXT: subl %r8d, %ecx -; AVX1-NEXT: setns %dil -; AVX1-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX1-NEXT: subl %r8d, %r10d -; AVX1-NEXT: cmovol %edi, %r10d -; AVX1-NEXT: vmovd %xmm0, %r8d -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: subl %r8d, %edi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX1-NEXT: subl %r8d, %eax -; AVX1-NEXT: cmovol %ecx, %eax -; AVX1-NEXT: vpextrd $1, %xmm0, %r8d -; AVX1-NEXT: vpextrd $1, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edi -; AVX1-NEXT: subl %r8d, %edi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX1-NEXT: subl %r8d, %esi -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %ecx -; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX2-NEXT: vpextrd $3, %xmm1, %r9d -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: movl %r9d, %edi -; AVX2-NEXT: subl %ecx, %edi -; AVX2-NEXT: setns %sil -; AVX2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX2-NEXT: subl %ecx, %r9d -; AVX2-NEXT: cmovol %esi, %r9d -; AVX2-NEXT: vpextrd $2, %xmm0, %r8d -; AVX2-NEXT: vpextrd $2, %xmm1, %r10d -; AVX2-NEXT: xorl %edi, %edi -; AVX2-NEXT: movl %r10d, %ecx -; AVX2-NEXT: subl 
%r8d, %ecx -; AVX2-NEXT: setns %dil -; AVX2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX2-NEXT: subl %r8d, %r10d -; AVX2-NEXT: cmovol %edi, %r10d -; AVX2-NEXT: vmovd %xmm0, %r8d -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: subl %r8d, %edi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX2-NEXT: subl %r8d, %eax -; AVX2-NEXT: cmovol %ecx, %eax -; AVX2-NEXT: vpextrd $1, %xmm0, %r8d -; AVX2-NEXT: vpextrd $1, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edi -; AVX2-NEXT: subl %r8d, %edi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX2-NEXT: subl %r8d, %esi -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdx) ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrd $3, %xmm0, %ecx -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrd $3, %xmm1, %r9d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: movl %r9d, %edi -; AVX512-NEXT: subl %ecx, %edi -; AVX512-NEXT: setns %sil -; AVX512-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX512-NEXT: subl %ecx, %r9d -; AVX512-NEXT: cmovol %esi, %r9d -; AVX512-NEXT: vpextrd $2, %xmm0, %r8d -; AVX512-NEXT: vpextrd $2, %xmm1, %r10d -; AVX512-NEXT: xorl %edi, %edi -; AVX512-NEXT: movl %r10d, %ecx -; AVX512-NEXT: subl %r8d, %ecx -; AVX512-NEXT: setns %dil -; AVX512-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX512-NEXT: subl %r8d, %r10d -; AVX512-NEXT: cmovol %edi, %r10d -; AVX512-NEXT: vmovd %xmm0, %r8d -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: subl %r8d, %edi -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX512-NEXT: subl %r8d, %eax -; AVX512-NEXT: cmovol %ecx, %eax -; AVX512-NEXT: vpextrd $1, %xmm0, %r8d -; AVX512-NEXT: vpextrd $1, %xmm1, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %esi, %edi -; AVX512-NEXT: subl %r8d, %edi -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX512-NEXT: subl %r8d, %esi -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vpmovdb %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <4 x i8>, <4 x i8>* %px @@ -14437,44 +280,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: movzwl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; SSE2-NEXT: psllq $56, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: movq %rcx, %rdi
-; SSE2-NEXT: subq %rax, %rdi
-; SSE2-NEXT: setns %sil
-; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
-; SSE2-NEXT: addq %r8, %rsi
-; SSE2-NEXT: subq %rax, %rcx
-; SSE2-NEXT: cmovoq %rsi, %rcx
-; SSE2-NEXT: movq %xmm1, %r9
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorl %edi, %edi
-; SSE2-NEXT: movq %rax, %rsi
-; SSE2-NEXT: subq %r9, %rsi
-; SSE2-NEXT: setns %dil
-; SSE2-NEXT: addq %r8, %rdi
-; SSE2-NEXT: subq %r9, %rax
-; SSE2-NEXT: cmovoq %rdi, %rax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: movq %rcx, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: psrlq $56, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: psubsb %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rdx)
; SSE2-NEXT: retq
@@ -14485,131 +293,38 @@
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: movzwl (%rsi), %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movq %xmm3, %rax
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSSE3-NEXT: movq %xmm2, %rcx
-; SSSE3-NEXT: xorl %esi, %esi
-; SSSE3-NEXT: movq %rcx, %rdi
-; SSSE3-NEXT: subq %rax, %rdi
-; SSSE3-NEXT: setns %sil
-; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
-; SSSE3-NEXT: addq %r8, %rsi
-; SSSE3-NEXT: subq %rax, %rcx
-; SSSE3-NEXT: cmovoq %rsi, %rcx
-; SSSE3-NEXT: movq %xmm1, %r9
-; SSSE3-NEXT: movq %xmm0, %rax
-; SSSE3-NEXT: xorl %edi, %edi
-; SSSE3-NEXT: movq %rax, %rsi
-; SSSE3-NEXT: subq %r9, %rsi
-; SSSE3-NEXT: setns %dil
-; SSSE3-NEXT: addq %r8, %rdi
-; SSSE3-NEXT: subq %r9, %rax
-; SSSE3-NEXT: cmovoq %rdi, %rax
-; SSSE3-NEXT: movq %rax, %xmm0
-; SSSE3-NEXT: movq %rcx, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: psubsb %xmm1, %xmm0
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: movw %ax, (%rdx)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psllq $56, %xmm1
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movq %xmm0, %rcx
-; SSE41-NEXT: xorl %esi, %esi
-; SSE41-NEXT: movq %rcx, %rdi
-; SSE41-NEXT: subq %rax, %rdi
-; SSE41-NEXT: setns %sil
-; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
-; SSE41-NEXT: addq %r8, %rsi
-; SSE41-NEXT: subq %rax, %rcx
-; SSE41-NEXT: cmovoq %rsi, %rcx
-; SSE41-NEXT: pextrq $1, %xmm1, %r9
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: xorl %edi, %edi
-; SSE41-NEXT: movq %rax, %rsi
-; SSE41-NEXT: subq %r9, %rsi
-; SSE41-NEXT: setns %dil
-; SSE41-NEXT: addq %r8, %rdi
-; SSE41-NEXT: subq %r9, %rax
-; SSE41-NEXT: cmovoq %rdi, %rax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: movq %rcx, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: pextrw $0, %xmm1, (%rdx)
+; SSE41-NEXT: movzwl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movzwl (%rsi), %eax
+; SSE41-NEXT: movd %eax, %xmm1
+; SSE41-NEXT: psubsb %xmm1, %xmm0
+; SSE41-NEXT: pextrw $0, %xmm0, (%rdx)
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX1-NEXT: vmovq %xmm1, %rax
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: xorl %esi, %esi
-; AVX1-NEXT: movq %rcx, %rdi
-; AVX1-NEXT: subq %rax, %rdi
-; AVX1-NEXT: setns %sil
-; AVX1-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
-; AVX1-NEXT: addq %r8, %rsi
-; AVX1-NEXT: subq %rax, %rcx
-; AVX1-NEXT: cmovoq %rsi, %rcx
-; AVX1-NEXT: vpextrq $1, %xmm1, %r9
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: xorl %edi, %edi
-; AVX1-NEXT: movq %rax, %rsi
-; AVX1-NEXT: subq %r9, %rsi
-; AVX1-NEXT: setns %dil
-; AVX1-NEXT: addq %r8, %rdi
-; AVX1-NEXT: subq %r9, %rax
-; AVX1-NEXT: cmovoq %rdi, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: movzwl (%rsi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx)
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: movq %rcx, %rdi
-; AVX2-NEXT: subq %rax, %rdi
-; AVX2-NEXT: setns %sil
-; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
-; AVX2-NEXT: addq %r8, %rsi
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: cmovoq %rsi, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm1, %r9
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: xorl %edi, %edi
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: subq %r9, %rsi
-; AVX2-NEXT: setns %dil
-; AVX2-NEXT: addq %r8, %rdi
-; AVX2-NEXT: subq %r9, %rax
-; AVX2-NEXT: cmovoq %rdi, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: movzwl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: movzwl (%rsi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx)
; AVX2-NEXT: retq
;
@@ -14619,32 +334,8 @@
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: movzwl (%rsi), %eax
; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1]
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: movq %rcx, %rdi
-; AVX512-NEXT: subq %rax, %rdi
-; AVX512-NEXT: setns %sil
-; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
-; AVX512-NEXT: addq %r8, %rsi
-; AVX512-NEXT: subq %rax, %rcx
-; AVX512-NEXT: cmovoq %rsi, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %r9
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512-NEXT: xorl %edi, %edi
-; AVX512-NEXT: movq %rax, %rsi
-; AVX512-NEXT: subq %r9, %rsi
-; AVX512-NEXT: setns %dil
-; AVX512-NEXT: addq %r8, %rdi
-; AVX512-NEXT: subq %r9, %rax
-; AVX512-NEXT: cmovoq %rdi, %rax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vmovq %rcx, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vpsrlq $56, %xmm0, %xmm0
+; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpmovqb %xmm0, (%rdx)
; AVX512-NEXT: retq
%x = load <2 x i8>, <2 x i8>* %px
@@ -14655,514 +346,67 @@
}

define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
-; SSE2-LABEL: v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, %ecx
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm2, %r8d
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: movl %r8d, %edi
-; SSE2-NEXT: subl %ecx, %edi
-; SSE2-NEXT: setns %sil
-; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF
-; SSE2-NEXT: subl %ecx, %r8d
-; SSE2-NEXT: cmovol %esi, %r8d
-; SSE2-NEXT: movd %xmm1, %esi
-; SSE2-NEXT: movd %xmm0, %r10d
-; SSE2-NEXT: xorl %edi, %edi
-; SSE2-NEXT: movl %r10d, %ecx
-; SSE2-NEXT: subl %esi, %ecx
-; SSE2-NEXT: setns %dil
-; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF
-; SSE2-NEXT: subl %esi, %r10d
-; SSE2-NEXT: cmovol %edi, %r10d
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %r9d
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: xorl %edi, %edi
-; SSE2-NEXT: movl %ecx, %esi
-; SSE2-NEXT: subl %r9d, %esi
-; SSE2-NEXT: setns %dil
-; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF
-; SSE2-NEXT: subl %r9d, %ecx
-; SSE2-NEXT: cmovol %edi, %ecx
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE2-NEXT: movd %xmm1, %r9d
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: movl %eax, %edi
-; SSE2-NEXT: subl %r9d, %edi
-; SSE2-NEXT: setns %sil
-; SSE2-NEXT: addl $2147483647, %esi # imm = 
0x7FFFFFFF -; SSE2-NEXT: subl %r9d, %eax -; SSE2-NEXT: cmovol %esi, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %r10d, %xmm0 -; SSE2-NEXT: movd %r8d, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v4i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSSE3-NEXT: movd %xmm3, %ecx -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSSE3-NEXT: movd %xmm2, %r8d -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %r8d, %edi -; SSSE3-NEXT: subl %ecx, %edi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSSE3-NEXT: subl %ecx, %r8d -; SSSE3-NEXT: cmovol %esi, %r8d -; SSSE3-NEXT: movd %xmm1, %esi -; SSSE3-NEXT: movd %xmm0, %r10d -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %r10d, %ecx -; SSSE3-NEXT: subl %esi, %ecx -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSSE3-NEXT: subl %esi, %r10d -; SSSE3-NEXT: cmovol %edi, %r10d -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %r9d -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: subl %r9d, %esi -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSSE3-NEXT: subl %r9d, %ecx -; SSSE3-NEXT: cmovol %edi, %ecx -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSSE3-NEXT: movd %xmm1, %r9d -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: subl %r9d, %edi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSSE3-NEXT: subl %r9d, %eax -; SSSE3-NEXT: cmovol %esi, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %r10d, %xmm0 -; SSSE3-NEXT: movd %r8d, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,10,11,14,15,14,15],zero,zero -; SSSE3-NEXT: movq %xmm0, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE41-NEXT: pextrd $3, %xmm1, %ecx -; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE41-NEXT: 
pextrd $3, %xmm0, %r8d -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %r8d, %edi -; SSE41-NEXT: subl %ecx, %edi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE41-NEXT: subl %ecx, %r8d -; SSE41-NEXT: cmovol %esi, %r8d -; SSE41-NEXT: pextrd $2, %xmm1, %esi -; SSE41-NEXT: pextrd $2, %xmm0, %r10d -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %r10d, %ecx -; SSE41-NEXT: subl %esi, %ecx -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE41-NEXT: subl %esi, %r10d -; SSE41-NEXT: cmovol %edi, %r10d -; SSE41-NEXT: movd %xmm1, %r9d -; SSE41-NEXT: movd %xmm0, %ecx -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %ecx, %esi -; SSE41-NEXT: subl %r9d, %esi -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; SSE41-NEXT: subl %r9d, %ecx -; SSE41-NEXT: cmovol %edi, %ecx -; SSE41-NEXT: pextrd $1, %xmm1, %r9d -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: subl %r9d, %edi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; SSE41-NEXT: subl %r9d, %eax -; SSE41-NEXT: cmovol %esi, %eax -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: pinsrd $2, %r10d, %xmm0 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: packusdw %xmm0, %xmm0 -; SSE41-NEXT: movq %xmm0, (%rdx) -; SSE41-NEXT: retq +; SSE-LABEL: v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: psubsw %xmm1, %xmm0 +; SSE-NEXT: movq %xmm0, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-NEXT: vpextrd $3, %xmm0, %ecx -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpextrd $3, %xmm1, %r9d -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: movl %r9d, %edi -; AVX1-NEXT: subl %ecx, %edi -; AVX1-NEXT: setns %sil -; AVX1-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX1-NEXT: subl %ecx, %r9d -; AVX1-NEXT: cmovol %esi, %r9d -; AVX1-NEXT: vpextrd $2, %xmm0, %r8d -; AVX1-NEXT: vpextrd $2, %xmm1, %r10d -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: movl %r10d, %ecx -; AVX1-NEXT: subl %r8d, %ecx -; AVX1-NEXT: setns %dil -; AVX1-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX1-NEXT: subl %r8d, %r10d -; AVX1-NEXT: cmovol %edi, %r10d -; AVX1-NEXT: vmovd %xmm0, %r8d -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %eax, %edi -; AVX1-NEXT: subl %r8d, %edi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX1-NEXT: subl %r8d, %eax -; AVX1-NEXT: cmovol %ecx, %eax -; AVX1-NEXT: vpextrd $1, %xmm0, %r8d -; AVX1-NEXT: vpextrd $1, %xmm1, %esi -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: movl %esi, %edi -; AVX1-NEXT: subl %r8d, %edi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX1-NEXT: subl %r8d, %esi -; AVX1-NEXT: cmovol %ecx, %esi -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; 
AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-NEXT: vpextrd $3, %xmm0, %ecx -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpextrd $3, %xmm1, %r9d -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: movl %r9d, %edi -; AVX2-NEXT: subl %ecx, %edi -; AVX2-NEXT: setns %sil -; AVX2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX2-NEXT: subl %ecx, %r9d -; AVX2-NEXT: cmovol %esi, %r9d -; AVX2-NEXT: vpextrd $2, %xmm0, %r8d -; AVX2-NEXT: vpextrd $2, %xmm1, %r10d -; AVX2-NEXT: xorl %edi, %edi -; AVX2-NEXT: movl %r10d, %ecx -; AVX2-NEXT: subl %r8d, %ecx -; AVX2-NEXT: setns %dil -; AVX2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX2-NEXT: subl %r8d, %r10d -; AVX2-NEXT: cmovol %edi, %r10d -; AVX2-NEXT: vmovd %xmm0, %r8d -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %eax, %edi -; AVX2-NEXT: subl %r8d, %edi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX2-NEXT: subl %r8d, %eax -; AVX2-NEXT: cmovol %ecx, %eax -; AVX2-NEXT: vpextrd $1, %xmm0, %r8d -; AVX2-NEXT: vpextrd $1, %xmm1, %esi -; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: movl %esi, %edi -; AVX2-NEXT: subl %r8d, %edi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX2-NEXT: subl %r8d, %esi -; AVX2-NEXT: cmovol %ecx, %esi -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdx) ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,1,255,255,2,3,255,255,4,5,255,255,6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpextrd $3, %xmm0, %ecx -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpextrd $3, %xmm1, %r9d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: movl %r9d, %edi -; AVX512-NEXT: subl %ecx, %edi -; AVX512-NEXT: setns %sil -; AVX512-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF -; AVX512-NEXT: subl %ecx, %r9d -; AVX512-NEXT: cmovol %esi, %r9d -; AVX512-NEXT: vpextrd $2, %xmm0, %r8d -; AVX512-NEXT: vpextrd $2, %xmm1, %r10d -; AVX512-NEXT: xorl %edi, %edi -; AVX512-NEXT: movl %r10d, %ecx -; AVX512-NEXT: subl %r8d, %ecx -; AVX512-NEXT: setns %dil -; AVX512-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF -; AVX512-NEXT: subl %r8d, %r10d -; AVX512-NEXT: cmovol %edi, %r10d -; AVX512-NEXT: vmovd %xmm0, %r8d -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: movl %eax, %edi -; AVX512-NEXT: subl %r8d, %edi -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX512-NEXT: subl %r8d, %eax -; AVX512-NEXT: cmovol %ecx, %eax -; AVX512-NEXT: vpextrd $1, %xmm0, %r8d -; AVX512-NEXT: vpextrd $1, %xmm1, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; 
AVX512-NEXT: movl %esi, %edi -; AVX512-NEXT: subl %r8d, %edi -; AVX512-NEXT: setns %cl -; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; AVX512-NEXT: subl %r8d, %esi -; AVX512-NEXT: cmovol %ecx, %esi -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512-NEXT: vpmovdw %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <4 x i16>, <4 x i16>* %px %y = load <4 x i16>, <4 x i16>* %py %z = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %x, <4 x i16> %y) - store <4 x i16> %z, <4 x i16>* %pz - ret void -} - -define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { -; SSE2-LABEL: v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; SSE2-NEXT: psllq $48, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: psllq $48, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movq %rcx, %rdi -; SSE2-NEXT: subq %rax, %rdi -; SSE2-NEXT: setns %sil -; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; SSE2-NEXT: addq %r8, %rsi -; SSE2-NEXT: subq %rax, %rcx -; SSE2-NEXT: cmovoq %rsi, %rcx -; SSE2-NEXT: movq %xmm1, %r9 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: subq %r9, %rsi -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addq %r8, %rdi -; SSE2-NEXT: subq %r9, %rax -; SSE2-NEXT: cmovoq %rdi, %rax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrlq $48, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movd %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v2i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm3, %rax -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSSE3-NEXT: movq %xmm2, %rcx -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movq %rcx, %rdi -; SSSE3-NEXT: subq %rax, %rdi -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; SSSE3-NEXT: addq %r8, %rsi -; SSSE3-NEXT: subq %rax, %rcx -; SSSE3-NEXT: cmovoq %rsi, %rcx -; SSSE3-NEXT: movq %xmm1, %r9 -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: subq %r9, %rsi -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addq %r8, %rdi -; SSSE3-NEXT: subq %r9, %rax -; SSSE3-NEXT: cmovoq %rdi, %rax -; SSSE3-NEXT: movq %rax, %xmm0 -; SSSE3-NEXT: movq %rcx, %xmm1 -; SSSE3-NEXT: punpcklqdq 
{{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movd %xmm0, (%rdx) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: psllq $48, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: psllq $48, %xmm0 -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movq %rcx, %rdi -; SSE41-NEXT: subq %rax, %rdi -; SSE41-NEXT: setns %sil -; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; SSE41-NEXT: addq %r8, %rsi -; SSE41-NEXT: subq %rax, %rcx -; SSE41-NEXT: cmovoq %rsi, %rcx -; SSE41-NEXT: pextrq $1, %xmm1, %r9 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movq %rax, %rsi -; SSE41-NEXT: subq %r9, %rsi -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addq %r8, %rdi -; SSE41-NEXT: subq %r9, %rax -; SSE41-NEXT: cmovoq %rdi, %rax -; SSE41-NEXT: movq %rax, %xmm0 -; SSE41-NEXT: movq %rcx, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,14,15],zero,zero,xmm1[14,15],zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movd %xmm1, (%rdx) -; SSE41-NEXT: retq + store <4 x i16> %z, <4 x i16>* %pz + ret void +} + +define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { +; SSE-LABEL: v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: psubsw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, (%rdx) +; SSE-NEXT: retq ; ; AVX1-LABEL: v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: xorl %esi, %esi -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: subq %rax, %rdi -; AVX1-NEXT: setns %sil -; AVX1-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; AVX1-NEXT: addq %r8, %rsi -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: cmovoq %rsi, %rcx -; AVX1-NEXT: vpextrq $1, %xmm1, %r9 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: subq %r9, %rsi -; AVX1-NEXT: setns %dil -; AVX1-NEXT: addq %r8, %rdi -; AVX1-NEXT: subq %r9, %rax -; AVX1-NEXT: cmovoq %rdi, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, (%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: movq %rcx, %rdi -; AVX2-NEXT: subq %rax, 
%rdi -; AVX2-NEXT: setns %sil -; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; AVX2-NEXT: addq %r8, %rsi -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: cmovoq %rsi, %rcx -; AVX2-NEXT: vpextrq $1, %xmm1, %r9 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: xorl %edi, %edi -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: subq %r9, %rsi -; AVX2-NEXT: setns %dil -; AVX2-NEXT: addq %r8, %rdi -; AVX2-NEXT: subq %r9, %rax -; AVX2-NEXT: cmovoq %rdi, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -15170,32 +414,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: movq %rcx, %rdi -; AVX512-NEXT: subq %rax, %rdi -; AVX512-NEXT: setns %sil -; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; AVX512-NEXT: addq %r8, %rsi -; AVX512-NEXT: subq %rax, %rcx -; AVX512-NEXT: cmovoq %rsi, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %r9 -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: xorl %edi, %edi -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: subq %r9, %rsi -; AVX512-NEXT: setns %dil -; AVX512-NEXT: addq %r8, %rdi -; AVX512-NEXT: subq %r9, %rax -; AVX512-NEXT: cmovoq %rdi, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vmovq %rcx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vpmovqw %xmm0, (%rdx) ; AVX512-NEXT: retq %x = load <2 x i16>, <2 x i16>* %px @@ -15206,1586 +426,59 @@ } define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { -; SSE2-LABEL: v12i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r9b -; SSE2-NEXT: jno .LBB11_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB11_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: jno .LBB11_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB11_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; 
SSE2-NEXT: jno .LBB11_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB11_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB11_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB11_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r10b -; SSE2-NEXT: jno .LBB11_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB11_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r11b -; SSE2-NEXT: jno .LBB11_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB11_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bpl -; SSE2-NEXT: jno .LBB11_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB11_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r14b -; SSE2-NEXT: jno .LBB11_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB11_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r15b -; SSE2-NEXT: jno .LBB11_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB11_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: jno .LBB11_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB11_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r13b -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB11_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB11_22: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dil -; SSE2-NEXT: jno .LBB11_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB11_24: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r8b -; SSE2-NEXT: jno .LBB11_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: 
movl %eax, %r8d -; SSE2-NEXT: .LBB11_26: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: jno .LBB11_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB11_28: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: subb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: subb %dl, %al -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB11_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB11_30: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %ecx -; SSE2-NEXT: subb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: subb %dl, %sil -; SSE2-NEXT: jno .LBB11_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %esi -; SSE2-NEXT: .LBB11_32: -; SSE2-NEXT: movzbl %sil, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r13b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movzbl %r12b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r15b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movzbl %r14b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %bpl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl %r11b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r10b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v12i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: jno .LBB11_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB11_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: jno .LBB11_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB11_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jno .LBB11_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB11_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB11_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB11_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r10b -; SSSE3-NEXT: jno .LBB11_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB11_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r11b -; SSSE3-NEXT: jno .LBB11_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB11_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bpl -; SSSE3-NEXT: jno .LBB11_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB11_14: -; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r14b -; SSSE3-NEXT: jno .LBB11_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB11_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r15b -; SSSE3-NEXT: jno .LBB11_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB11_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b -; SSSE3-NEXT: jno .LBB11_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB11_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r13b -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB11_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB11_22: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dil -; SSSE3-NEXT: jno .LBB11_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB11_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r8b -; SSSE3-NEXT: jno .LBB11_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB11_26: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jno .LBB11_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB11_28: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: subb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: subb %dl, %al -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB11_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB11_30: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %ecx -; SSSE3-NEXT: subb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: subb %dl, %sil -; SSSE3-NEXT: jno .LBB11_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: .LBB11_32: -; SSSE3-NEXT: movzbl %sil, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r8b, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r13b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: movzbl %r12b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r15b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movzbl %r14b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %bpl, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl %r11b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r10b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v12i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrb $15, %xmm1, %ecx -; SSE41-NEXT: pextrb $15, %xmm0, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB11_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB11_2: -; SSE41-NEXT: pextrb $14, 
%xmm1, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r11b -; SSE41-NEXT: jno .LBB11_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB11_4: -; SSE41-NEXT: pextrb $13, %xmm1, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB11_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB11_6: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $12, %xmm1, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r14b -; SSE41-NEXT: jno .LBB11_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB11_8: -; SSE41-NEXT: pextrb $11, %xmm1, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB11_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB11_10: -; SSE41-NEXT: pextrb $10, %xmm1, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r15b -; SSE41-NEXT: jno .LBB11_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB11_12: -; SSE41-NEXT: pextrb $9, %xmm1, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r12b -; SSE41-NEXT: jno .LBB11_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB11_14: -; SSE41-NEXT: pextrb $8, %xmm1, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r13b -; SSE41-NEXT: jno .LBB11_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB11_16: -; SSE41-NEXT: pextrb $7, %xmm1, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r10b -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB11_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB11_18: -; SSE41-NEXT: pextrb $6, %xmm1, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r9b -; SSE41-NEXT: jno .LBB11_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB11_20: -; SSE41-NEXT: pextrb $5, %xmm1, %ecx -; SSE41-NEXT: pextrb $5, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB11_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: addb $127, %al -; 
SSE41-NEXT: movl %eax, %ebp
-; SSE41-NEXT: .LBB11_22:
-; SSE41-NEXT: pextrb $4, %xmm1, %ecx
-; SSE41-NEXT: pextrb $4, %xmm0, %edi
-; SSE41-NEXT: movl %edi, %eax
-; SSE41-NEXT: subb %cl, %al
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: subb %cl, %dil
-; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE41-NEXT: jno .LBB11_24
-; SSE41-NEXT: # %bb.23:
-; SSE41-NEXT: addb $127, %al
-; SSE41-NEXT: movl %eax, %edi
-; SSE41-NEXT: .LBB11_24:
-; SSE41-NEXT: pextrb $3, %xmm1, %edx
-; SSE41-NEXT: pextrb $3, %xmm0, %eax
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: subb %dl, %cl
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: subb %dl, %al
-; SSE41-NEXT: jno .LBB11_26
-; SSE41-NEXT: # %bb.25:
-; SSE41-NEXT: addb $127, %cl
-; SSE41-NEXT: movl %ecx, %eax
-; SSE41-NEXT: .LBB11_26:
-; SSE41-NEXT: pextrb $2, %xmm1, %ebx
-; SSE41-NEXT: pextrb $2, %xmm0, %ecx
-; SSE41-NEXT: movl %ecx, %edx
-; SSE41-NEXT: subb %bl, %dl
-; SSE41-NEXT: setns %dl
-; SSE41-NEXT: subb %bl, %cl
-; SSE41-NEXT: jno .LBB11_28
-; SSE41-NEXT: # %bb.27:
-; SSE41-NEXT: addb $127, %dl
-; SSE41-NEXT: movl %edx, %ecx
-; SSE41-NEXT: .LBB11_28:
-; SSE41-NEXT: pextrb $0, %xmm1, %esi
-; SSE41-NEXT: pextrb $0, %xmm0, %edx
-; SSE41-NEXT: movl %edx, %ebx
-; SSE41-NEXT: subb %sil, %bl
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: subb %sil, %dl
-; SSE41-NEXT: jno .LBB11_30
-; SSE41-NEXT: # %bb.29:
-; SSE41-NEXT: addb $127, %bl
-; SSE41-NEXT: movl %ebx, %edx
-; SSE41-NEXT: .LBB11_30:
-; SSE41-NEXT: pextrb $1, %xmm1, %esi
-; SSE41-NEXT: pextrb $1, %xmm0, %r8d
-; SSE41-NEXT: movl %r8d, %ebx
-; SSE41-NEXT: subb %sil, %bl
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: subb %sil, %r8b
-; SSE41-NEXT: jno .LBB11_32
-; SSE41-NEXT: # %bb.31:
-; SSE41-NEXT: addb $127, %bl
-; SSE41-NEXT: movl %ebx, %r8d
-; SSE41-NEXT: .LBB11_32:
-; SSE41-NEXT: movzbl %dl, %edx
-; SSE41-NEXT: movd %edx, %xmm0
-; SSE41-NEXT: movzbl %r8b, %edx
-; SSE41-NEXT: pinsrb $1, %edx, %xmm0
-; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm0
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $3, %eax, %xmm0
-; SSE41-NEXT: movzbl %dil, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm0
-; SSE41-NEXT: movzbl %bpl, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm0
-; SSE41-NEXT: movzbl %r9b, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm0
-; SSE41-NEXT: movzbl %r10b, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm0
-; SSE41-NEXT: movzbl %r13b, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm0
-; SSE41-NEXT: movzbl %r12b, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm0
-; SSE41-NEXT: movzbl %r15b, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $11, %eax, %xmm0
-; SSE41-NEXT: movzbl %r14b, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: movzbl %r11b, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm0
-; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
-; SSE41-NEXT: retq
+; SSE-LABEL: v12i8:
+; SSE: # %bb.0:
+; SSE-NEXT: psubsb %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: v12i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX-NEXT: vpextrb $15, %xmm0, %edx
-; AVX-NEXT: movl %edx, %eax
-; AVX-NEXT: subb %cl, %al
-; AVX-NEXT: setns %al
-; AVX-NEXT: subb %cl, %dl
-; AVX-NEXT: jno .LBB11_2
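A note on the scalar expansion in the removed CHECK lines above and below: each lane uses the same idiom. The lane is subtracted twice, once into a scratch copy whose sign `setns` records, and `jno` skips the fix-up when the 8-bit subtract did not overflow. On overflow, `setns` has left 0 or 1 in the scratch register, so `addb $127` turns it into 127 or -128 (0x80), exactly the bound the overflow direction requires. A minimal C sketch of the per-lane math (hypothetical helper name, not LLVM's code):

  #include <stdint.h>

  /* One lane of ssub.sat on i8: form the exact difference in a wider
     type, then clamp it to the signed 8-bit range. */
  static inline int8_t ssub_sat_i8(int8_t x, int8_t y) {
      int16_t d = (int16_t)x - (int16_t)y;  /* exact, cannot wrap */
      if (d > INT8_MAX) d = INT8_MAX;       /* positive overflow -> 127 */
      if (d < INT8_MIN) d = INT8_MIN;       /* negative overflow -> -128 */
      return (int8_t)d;
  }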
-; AVX-NEXT: # %bb.1: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: .LBB11_2: -; AVX-NEXT: vpextrb $14, %xmm1, %ecx -; AVX-NEXT: vpextrb $14, %xmm0, %r11d -; AVX-NEXT: movl %r11d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r11b -; AVX-NEXT: jno .LBB11_4 -; AVX-NEXT: # %bb.3: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r11d -; AVX-NEXT: .LBB11_4: -; AVX-NEXT: vpextrb $13, %xmm1, %ecx -; AVX-NEXT: vpextrb $13, %xmm0, %edi -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %dil -; AVX-NEXT: jno .LBB11_6 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: .LBB11_6: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r15 -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 -; AVX-NEXT: pushq %r12 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vpextrb $12, %xmm1, %ecx -; AVX-NEXT: vpextrb $12, %xmm0, %r14d -; AVX-NEXT: movl %r14d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r14b -; AVX-NEXT: jno .LBB11_8 -; AVX-NEXT: # %bb.7: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r14d -; AVX-NEXT: .LBB11_8: -; AVX-NEXT: vpextrb $11, %xmm1, %ecx -; AVX-NEXT: vpextrb $11, %xmm0, %ebp -; AVX-NEXT: movl %ebp, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %bpl -; AVX-NEXT: jno .LBB11_10 -; AVX-NEXT: # %bb.9: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: .LBB11_10: -; AVX-NEXT: vpextrb $10, %xmm1, %ecx -; AVX-NEXT: vpextrb $10, %xmm0, %r15d -; AVX-NEXT: movl %r15d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r15b -; AVX-NEXT: jno .LBB11_12 -; AVX-NEXT: # %bb.11: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r15d -; AVX-NEXT: .LBB11_12: -; AVX-NEXT: vpextrb $9, %xmm1, %ecx -; AVX-NEXT: vpextrb $9, %xmm0, %r12d -; AVX-NEXT: movl %r12d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r12b -; AVX-NEXT: jno .LBB11_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r12d -; AVX-NEXT: .LBB11_14: -; AVX-NEXT: vpextrb $8, %xmm1, %ecx -; AVX-NEXT: vpextrb $8, %xmm0, %r13d -; AVX-NEXT: movl %r13d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r13b -; AVX-NEXT: jno .LBB11_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r13d -; AVX-NEXT: .LBB11_16: -; AVX-NEXT: vpextrb $7, %xmm1, %ecx -; AVX-NEXT: vpextrb $7, %xmm0, %r10d -; AVX-NEXT: movl %r10d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r10b -; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB11_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r10d -; AVX-NEXT: .LBB11_18: -; AVX-NEXT: vpextrb $6, %xmm1, %ecx -; AVX-NEXT: vpextrb $6, %xmm0, %r9d -; AVX-NEXT: movl %r9d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r9b -; AVX-NEXT: jno .LBB11_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r9d -; AVX-NEXT: .LBB11_20: -; AVX-NEXT: vpextrb $5, %xmm1, %ecx -; AVX-NEXT: vpextrb $5, %xmm0, %ebp -; AVX-NEXT: movl %ebp, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %bpl -; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB11_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: .LBB11_22: -; AVX-NEXT: vpextrb $4, %xmm1, %ecx 
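The new CHECK lines expect all of this to collapse to the native saturating instruction. Expressed as C intrinsics (a sketch for orientation only; the test drives llc on IR and does not use intrinsics):

  #include <emmintrin.h>  /* SSE2 */

  /* psubsb: 16 lanes of signed saturating byte subtract, one instruction. */
  static inline __m128i sub_sat_i8x16(__m128i x, __m128i y) {
      return _mm_subs_epi8(x, y);
  }

  /* psubsw: 8 lanes of signed saturating word subtract; this is what the
     v12i16 test below is expected to use. */
  static inline __m128i sub_sat_i16x8(__m128i x, __m128i y) {
      return _mm_subs_epi16(x, y);
  }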
-; AVX-NEXT: vpextrb $4, %xmm0, %edi -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %dil -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB11_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: .LBB11_24: -; AVX-NEXT: vpextrb $3, %xmm1, %edx -; AVX-NEXT: vpextrb $3, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: setns %cl -; AVX-NEXT: subb %dl, %al -; AVX-NEXT: jno .LBB11_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: addb $127, %cl -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB11_26: -; AVX-NEXT: vpextrb $2, %xmm1, %ebx -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: subb %bl, %dl -; AVX-NEXT: setns %dl -; AVX-NEXT: subb %bl, %cl -; AVX-NEXT: jno .LBB11_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: addb $127, %dl -; AVX-NEXT: movl %edx, %ecx -; AVX-NEXT: .LBB11_28: -; AVX-NEXT: vpextrb $0, %xmm1, %esi -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: movl %edx, %ebx -; AVX-NEXT: subb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: subb %sil, %dl -; AVX-NEXT: jno .LBB11_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %edx -; AVX-NEXT: .LBB11_30: -; AVX-NEXT: vpextrb $1, %xmm1, %esi -; AVX-NEXT: vpextrb $1, %xmm0, %r8d -; AVX-NEXT: movl %r8d, %ebx -; AVX-NEXT: subb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: subb %sil, %r8b -; AVX-NEXT: jno .LBB11_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %r8d -; AVX-NEXT: .LBB11_32: -; AVX-NEXT: movzbl %dl, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: movzbl %r8b, %edx -; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %cl, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %dil, %eax -; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %bpl, %eax -; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r9b, %eax -; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r10b, %eax -; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r13b, %eax -; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r12b, %eax -; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r15b, %eax -; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r14b, %eax -; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r11b, %eax -; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r13 -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %r15 -; AVX-NEXT: popq %rbp +; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8> %x, <12 x i8> %y) ret <12 x i8> %z } define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { -; SSE2-LABEL: v12i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: 
movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: movdqa (%rsi), %xmm1 -; SSE2-NEXT: movdqa 16(%rsi), %xmm3 -; SSE2-NEXT: pextrw $3, %xmm3, %eax -; SSE2-NEXT: pextrw $3, %xmm2, %edx -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %edx, %esi -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %dx -; SSE2-NEXT: cmovol %ecx, %edx -; SSE2-NEXT: pextrw $2, %xmm3, %eax -; SSE2-NEXT: pextrw $2, %xmm2, %r9d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r9d, %esi -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r9w -; SSE2-NEXT: cmovol %ecx, %r9d -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: movd %xmm2, %r10d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r10d, %esi -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r10w -; SSE2-NEXT: cmovol %ecx, %r10d -; SSE2-NEXT: pextrw $1, %xmm3, %eax -; SSE2-NEXT: pextrw $1, %xmm2, %r11d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r11d, %esi -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r11w -; SSE2-NEXT: cmovol %ecx, %r11d -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: movd %xmm0, %r14d -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %r14d, %esi -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r14w -; SSE2-NEXT: cmovol %ecx, %r14d -; SSE2-NEXT: pextrw $1, %xmm1, %eax -; SSE2-NEXT: pextrw $1, %xmm0, %r15d -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %r15d, %edi -; SSE2-NEXT: subw %ax, %di -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r15w -; SSE2-NEXT: cmovol %esi, %r15d -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: pextrw $2, %xmm0, %r12d -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %r12d, %ebx -; SSE2-NEXT: subw %ax, %bx -; SSE2-NEXT: setns %dil -; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r12w -; SSE2-NEXT: cmovol %edi, %r12d -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: pextrw $3, %xmm0, %r13d -; SSE2-NEXT: xorl %ebx, %ebx -; SSE2-NEXT: movl %r13d, %ebp -; SSE2-NEXT: subw %ax, %bp -; SSE2-NEXT: setns %bl -; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %r13w -; SSE2-NEXT: cmovol %ebx, %r13d -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: pextrw $4, %xmm0, %ebx -; SSE2-NEXT: xorl %ebp, %ebp -; SSE2-NEXT: movl %ebx, %ecx -; SSE2-NEXT: subw %ax, %cx -; SSE2-NEXT: setns %bpl -; SSE2-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %bx -; SSE2-NEXT: cmovol %ebp, %ebx -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: pextrw $5, %xmm0, %ebp -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: movl %ebp, %esi -; SSE2-NEXT: subw %ax, %si -; SSE2-NEXT: setns %cl -; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE2-NEXT: subw %ax, %bp -; SSE2-NEXT: cmovol %ecx, %ebp -; SSE2-NEXT: pextrw $6, %xmm1, %ecx -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: subw %cx, %di -; SSE2-NEXT: setns %sil -; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE2-NEXT: subw %cx, %ax -; SSE2-NEXT: cmovol %esi, %eax -; SSE2-NEXT: pextrw $7, %xmm1, %ecx -; SSE2-NEXT: pextrw $7, %xmm0, %esi -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: movl %esi, %r8d -; SSE2-NEXT: subw %cx, %r8w -; SSE2-NEXT: setns %dil -; 
SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF
-; SSE2-NEXT: subw %cx, %si
-; SSE2-NEXT: cmovol %edi, %esi
-; SSE2-NEXT: movd %esi, %xmm8
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movd %ebp, %xmm2
-; SSE2-NEXT: movd %ebx, %xmm3
-; SSE2-NEXT: movd %r13d, %xmm4
-; SSE2-NEXT: movd %r12d, %xmm5
-; SSE2-NEXT: movd %r15d, %xmm6
-; SSE2-NEXT: movd %r14d, %xmm7
-; SSE2-NEXT: movd %r10d, %xmm0
-; SSE2-NEXT: pinsrw $1, %r11d, %xmm0
-; SSE2-NEXT: pinsrw $2, %r9d, %xmm0
-; SSE2-NEXT: pinsrw $3, %edx, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0]
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT: movq %xmm0, 16(%rax)
-; SSE2-NEXT: movdqa %xmm7, (%rax)
-; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: popq %r12
-; SSE2-NEXT: popq %r13
-; SSE2-NEXT: popq %r14
-; SSE2-NEXT: popq %r15
-; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: retq
+; SSE-LABEL: v12i16:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movdqa 16(%rdi), %xmm1
+; SSE-NEXT: psubsw (%rsi), %xmm0
+; SSE-NEXT: psubsw 16(%rsi), %xmm1
+; SSE-NEXT: movq %xmm1, 16(%rdx)
+; SSE-NEXT: movdqa %xmm0, (%rdx)
+; SSE-NEXT: retq
;
-; SSSE3-LABEL: v12i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pushq %rbp
-; SSSE3-NEXT: pushq %r15
-; SSSE3-NEXT: pushq %r14
-; SSSE3-NEXT: pushq %r13
-; SSSE3-NEXT: pushq %r12
-; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSSE3-NEXT: movdqa (%rdi), %xmm0
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm2
-; SSSE3-NEXT: movdqa (%rsi), %xmm1
-; SSSE3-NEXT: movdqa 16(%rsi), %xmm3
-; SSSE3-NEXT: pextrw $3, %xmm3, %eax
-; SSSE3-NEXT: pextrw $3, %xmm2, %edx
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: movl %edx, %esi
-; SSSE3-NEXT: subw %ax, %si
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSSE3-NEXT: subw %ax, %dx
-; SSSE3-NEXT: cmovol %ecx, %edx
-; SSSE3-NEXT: pextrw $2, %xmm3, %eax
-; SSSE3-NEXT: pextrw $2, %xmm2, %r9d
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: movl %r9d, %esi
-; SSSE3-NEXT: subw %ax, %si
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSSE3-NEXT: subw %ax, %r9w
-; SSSE3-NEXT: cmovol %ecx, %r9d
-; SSSE3-NEXT: movd %xmm3, %eax
-; SSSE3-NEXT: movd %xmm2, %r10d
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: movl %r10d, %esi
-; SSSE3-NEXT: subw %ax, %si
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSSE3-NEXT: subw %ax, %r10w
-; SSSE3-NEXT: cmovol %ecx, %r10d
-; SSSE3-NEXT: pextrw $1, %xmm3, %eax
-; SSSE3-NEXT: pextrw $1, %xmm2, %r11d
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: movl %r11d, %esi
-; SSSE3-NEXT: subw %ax, %si
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSSE3-NEXT: subw %ax, %r11w
-; SSSE3-NEXT: cmovol %ecx, %r11d
-; SSSE3-NEXT: movd %xmm1, %eax
-; SSSE3-NEXT: movd %xmm0, %r14d
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: movl %r14d, %esi
-; SSSE3-NEXT: subw %ax, %si
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; SSSE3-NEXT: subw
%ax, %r14w -; SSSE3-NEXT: cmovol %ecx, %r14d -; SSSE3-NEXT: pextrw $1, %xmm1, %eax -; SSSE3-NEXT: pextrw $1, %xmm0, %r15d -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %r15d, %edi -; SSSE3-NEXT: subw %ax, %di -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r15w -; SSSE3-NEXT: cmovol %esi, %r15d -; SSSE3-NEXT: pextrw $2, %xmm1, %eax -; SSSE3-NEXT: pextrw $2, %xmm0, %r12d -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %r12d, %ebx -; SSSE3-NEXT: subw %ax, %bx -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r12w -; SSSE3-NEXT: cmovol %edi, %r12d -; SSSE3-NEXT: pextrw $3, %xmm1, %eax -; SSSE3-NEXT: pextrw $3, %xmm0, %r13d -; SSSE3-NEXT: xorl %ebx, %ebx -; SSSE3-NEXT: movl %r13d, %ebp -; SSSE3-NEXT: subw %ax, %bp -; SSSE3-NEXT: setns %bl -; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %r13w -; SSSE3-NEXT: cmovol %ebx, %r13d -; SSSE3-NEXT: pextrw $4, %xmm1, %eax -; SSSE3-NEXT: pextrw $4, %xmm0, %ebx -; SSSE3-NEXT: xorl %ebp, %ebp -; SSSE3-NEXT: movl %ebx, %ecx -; SSSE3-NEXT: subw %ax, %cx -; SSSE3-NEXT: setns %bpl -; SSSE3-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %bx -; SSSE3-NEXT: cmovol %ebp, %ebx -; SSSE3-NEXT: pextrw $5, %xmm1, %eax -; SSSE3-NEXT: pextrw $5, %xmm0, %ebp -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: movl %ebp, %esi -; SSSE3-NEXT: subw %ax, %si -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSSE3-NEXT: subw %ax, %bp -; SSSE3-NEXT: cmovol %ecx, %ebp -; SSSE3-NEXT: pextrw $6, %xmm1, %ecx -; SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSSE3-NEXT: xorl %esi, %esi -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: subw %cx, %di -; SSSE3-NEXT: setns %sil -; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSSE3-NEXT: subw %cx, %ax -; SSSE3-NEXT: cmovol %esi, %eax -; SSSE3-NEXT: pextrw $7, %xmm1, %ecx -; SSSE3-NEXT: pextrw $7, %xmm0, %esi -; SSSE3-NEXT: xorl %edi, %edi -; SSSE3-NEXT: movl %esi, %r8d -; SSSE3-NEXT: subw %cx, %r8w -; SSSE3-NEXT: setns %dil -; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSSE3-NEXT: subw %cx, %si -; SSSE3-NEXT: cmovol %edi, %esi -; SSSE3-NEXT: movd %esi, %xmm8 -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movd %ebp, %xmm2 -; SSSE3-NEXT: movd %ebx, %xmm3 -; SSSE3-NEXT: movd %r13d, %xmm4 -; SSSE3-NEXT: movd %r12d, %xmm5 -; SSSE3-NEXT: movd %r15d, %xmm6 -; SSSE3-NEXT: movd %r14d, %xmm7 -; SSSE3-NEXT: movd %r10d, %xmm0 -; SSSE3-NEXT: pinsrw $1, %r11d, %xmm0 -; SSSE3-NEXT: pinsrw $2, %r9d, %xmm0 -; SSSE3-NEXT: pinsrw $3, %edx, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSSE3-NEXT: movq %xmm0, 16(%rax) -; SSSE3-NEXT: movdqa %xmm7, (%rax) -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq +; AVX1-LABEL: v12i16: +; AVX1: # 
%bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpsubsw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpsubsw 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: retq ; -; SSE41-LABEL: v12i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa (%rsi), %xmm1 -; SSE41-NEXT: movdqa 16(%rsi), %xmm3 -; SSE41-NEXT: pextrw $3, %xmm3, %eax -; SSE41-NEXT: pextrw $3, %xmm2, %edx -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %edx, %esi -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %dx -; SSE41-NEXT: cmovol %ecx, %edx -; SSE41-NEXT: pextrw $2, %xmm3, %eax -; SSE41-NEXT: pextrw $2, %xmm2, %r9d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r9d, %esi -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r9w -; SSE41-NEXT: cmovol %ecx, %r9d -; SSE41-NEXT: movd %xmm3, %eax -; SSE41-NEXT: movd %xmm2, %r10d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r10d, %esi -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r10w -; SSE41-NEXT: cmovol %ecx, %r10d -; SSE41-NEXT: pextrw $1, %xmm3, %eax -; SSE41-NEXT: pextrw $1, %xmm2, %r11d -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %r11d, %esi -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %r11w -; SSE41-NEXT: cmovol %ecx, %r11d -; SSE41-NEXT: pextrw $7, %xmm1, %ecx -; SSE41-NEXT: pextrw $7, %xmm0, %r14d -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %r14d, %edi -; SSE41-NEXT: subw %cx, %di -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE41-NEXT: subw %cx, %r14w -; SSE41-NEXT: cmovol %esi, %r14d -; SSE41-NEXT: pextrw $6, %xmm1, %esi -; SSE41-NEXT: pextrw $6, %xmm0, %r15d -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %r15d, %ebx -; SSE41-NEXT: subw %si, %bx -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE41-NEXT: subw %si, %r15w -; SSE41-NEXT: cmovol %edi, %r15d -; SSE41-NEXT: pextrw $5, %xmm1, %edi -; SSE41-NEXT: pextrw $5, %xmm0, %r12d -; SSE41-NEXT: xorl %ebx, %ebx -; SSE41-NEXT: movl %r12d, %ebp -; SSE41-NEXT: subw %di, %bp -; SSE41-NEXT: setns %bl -; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF -; SSE41-NEXT: subw %di, %r12w -; SSE41-NEXT: cmovol %ebx, %r12d -; SSE41-NEXT: pextrw $4, %xmm1, %ebx -; SSE41-NEXT: pextrw $4, %xmm0, %r13d -; SSE41-NEXT: xorl %ebp, %ebp -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: subw %bx, %ax -; SSE41-NEXT: setns %bpl -; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSE41-NEXT: subw %bx, %r13w -; SSE41-NEXT: cmovol %ebp, %r13d -; SSE41-NEXT: pextrw $3, %xmm1, %eax -; SSE41-NEXT: pextrw $3, %xmm0, %ebx -; SSE41-NEXT: xorl %ebp, %ebp -; SSE41-NEXT: movl %ebx, %ecx -; SSE41-NEXT: subw %ax, %cx -; SSE41-NEXT: setns %bpl -; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %bx -; SSE41-NEXT: cmovol %ebp, %ebx -; SSE41-NEXT: pextrw $2, %xmm1, %eax -; SSE41-NEXT: pextrw $2, %xmm0, %ebp -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: movl %ebp, %esi 
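The 16-bit lanes in the removed CHECK lines around here use a branchless variant of the same clamp: `setns` on a scratch copy of the difference leaves 0 or 1, `addl $32767` turns that into 32767 or 32768 (0x8000, i.e. -32768 as i16), and `cmovol` substitutes that bound only when the real `subw` overflowed. A C sketch of the idiom (hypothetical helper name; assumes two's-complement wrap on narrowing, as on x86):

  #include <stdint.h>

  static inline int16_t ssub_sat_i16(int16_t x, int16_t y) {
      int32_t d       = (int32_t)x - (int32_t)y;  /* exact difference */
      int16_t wrapped = (int16_t)d;               /* what subw leaves behind */
      /* setns/addl $32767: 32767 when the wrapped result is negative,
         32768 (0x8000 == -32768) when it is non-negative. */
      int32_t sat = 32767 + (wrapped >= 0);
      /* cmovol: keep the wrapped result unless the subtract overflowed. */
      return (d == wrapped) ? wrapped : (int16_t)sat;
  }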
-; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: setns %cl -; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %bp -; SSE41-NEXT: cmovol %ecx, %ebp -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: movd %xmm0, %ecx -; SSE41-NEXT: xorl %esi, %esi -; SSE41-NEXT: movl %ecx, %edi -; SSE41-NEXT: subw %ax, %di -; SSE41-NEXT: setns %sil -; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %cx -; SSE41-NEXT: cmovol %esi, %ecx -; SSE41-NEXT: pextrw $1, %xmm1, %eax -; SSE41-NEXT: pextrw $1, %xmm0, %esi -; SSE41-NEXT: xorl %edi, %edi -; SSE41-NEXT: movl %esi, %r8d -; SSE41-NEXT: subw %ax, %r8w -; SSE41-NEXT: setns %dil -; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF -; SSE41-NEXT: subw %ax, %si -; SSE41-NEXT: cmovol %edi, %esi -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrw $1, %esi, %xmm0 -; SSE41-NEXT: pinsrw $2, %ebp, %xmm0 -; SSE41-NEXT: pinsrw $3, %ebx, %xmm0 -; SSE41-NEXT: pinsrw $4, %r13d, %xmm0 -; SSE41-NEXT: pinsrw $5, %r12d, %xmm0 -; SSE41-NEXT: pinsrw $6, %r15d, %xmm0 -; SSE41-NEXT: pinsrw $7, %r14d, %xmm0 -; SSE41-NEXT: movd %r10d, %xmm1 -; SSE41-NEXT: pinsrw $1, %r11d, %xmm1 -; SSE41-NEXT: pinsrw $2, %r9d, %xmm1 -; SSE41-NEXT: pinsrw $3, %edx, %xmm1 -; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE41-NEXT: movq %xmm1, 16(%rax) -; SSE41-NEXT: movdqa %xmm0, (%rax) -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; AVX2-LABEL: v12i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpsubsw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rdx) +; AVX2-NEXT: vmovdqa %xmm0, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; -; AVX-LABEL: v12i16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r15 -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 -; AVX-NEXT: pushq %r12 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: vmovdqa (%rsi), %xmm2 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX-NEXT: vmovd %xmm2, %eax -; AVX-NEXT: vmovdqa (%rdi), %xmm3 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovd %xmm3, %edx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %dx -; AVX-NEXT: cmovol %ecx, %edx -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: vpextrw $1, %xmm2, %eax -; AVX-NEXT: vpextrw $1, %xmm3, %edx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %dx -; AVX-NEXT: cmovol %ecx, %edx -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: vpextrw $2, %xmm2, %eax -; AVX-NEXT: vpextrw $2, %xmm3, %edx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %dx -; AVX-NEXT: cmovol %ecx, %edx -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: vpextrw $3, %xmm2, %eax -; AVX-NEXT: vpextrw $3, %xmm3, %edx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %dx -; AVX-NEXT: cmovol %ecx, %edx -; AVX-NEXT: movl %edx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: vpextrw $4, %xmm2, %eax -; AVX-NEXT: vpextrw $4, %xmm3, %r14d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %r14d, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %r14w -; AVX-NEXT: cmovol %ecx, %r14d -; AVX-NEXT: vpextrw $5, %xmm2, %eax -; AVX-NEXT: vpextrw $5, %xmm3, %r15d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %r15d, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %r15w -; AVX-NEXT: cmovol %ecx, %r15d -; AVX-NEXT: vpextrw $6, %xmm2, %eax -; AVX-NEXT: vpextrw $6, %xmm3, %r12d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %r12d, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %r12w -; AVX-NEXT: cmovol %ecx, %r12d -; AVX-NEXT: vpextrw $7, %xmm2, %eax -; AVX-NEXT: vpextrw $7, %xmm3, %r13d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %r13d, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %r13w -; AVX-NEXT: cmovol %ecx, %r13d -; AVX-NEXT: vpextrw $7, %xmm0, %eax -; AVX-NEXT: vpextrw $7, %xmm1, %ebx -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %ebx, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %bx -; AVX-NEXT: cmovol %ecx, %ebx -; AVX-NEXT: vpextrw $6, %xmm0, %eax -; AVX-NEXT: vpextrw $6, %xmm1, %ebp -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: movl %ebp, %esi -; AVX-NEXT: subw %ax, %si -; AVX-NEXT: setns %cl -; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF -; AVX-NEXT: subw %ax, %bp -; AVX-NEXT: cmovol %ecx, %ebp -; AVX-NEXT: vpextrw $5, %xmm0, %ecx -; AVX-NEXT: vpextrw $5, %xmm1, %eax -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: subw %cx, %di -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX-NEXT: subw %cx, %ax -; AVX-NEXT: cmovol %esi, %eax -; AVX-NEXT: vpextrw $4, %xmm0, %esi -; AVX-NEXT: vpextrw $4, %xmm1, %ecx -; AVX-NEXT: xorl %edi, %edi -; AVX-NEXT: movl %ecx, %r8d -; AVX-NEXT: subw %si, %r8w -; AVX-NEXT: setns %dil -; AVX-NEXT: addl $32767, %edi # imm = 0x7FFF -; AVX-NEXT: subw %si, %cx -; AVX-NEXT: cmovol %edi, %ecx -; AVX-NEXT: vpextrw $3, %xmm0, %edi -; AVX-NEXT: vpextrw $3, %xmm1, %r8d -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %r8d, %edx -; AVX-NEXT: subw %di, %dx -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX-NEXT: subw %di, %r8w -; AVX-NEXT: cmovol %esi, %r8d -; AVX-NEXT: vpextrw $2, %xmm0, %edx -; AVX-NEXT: vpextrw $2, %xmm1, %edi -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: movl %edi, %r9d -; AVX-NEXT: subw %dx, %r9w -; AVX-NEXT: setns %sil -; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF -; AVX-NEXT: subw %dx, %di -; AVX-NEXT: cmovol %esi, %edi -; AVX-NEXT: vmovd %xmm0, %r9d -; AVX-NEXT: vmovd %xmm1, %esi -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: movl %esi, %r10d -; AVX-NEXT: subw %r9w, %r10w -; AVX-NEXT: setns %dl -; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF -; AVX-NEXT: subw %r9w, %si -; AVX-NEXT: cmovol %edx, %esi -; AVX-NEXT: vpextrw $1, %xmm0, %r9d -; AVX-NEXT: vpextrw $1, %xmm1, %edx -; AVX-NEXT: xorl %r10d, %r10d -; AVX-NEXT: movl %edx, %r11d -; AVX-NEXT: subw %r9w, %r11w -; AVX-NEXT: setns %r10b -; AVX-NEXT: addl $32767, %r10d # imm = 0x7FFF -; AVX-NEXT: subw %r9w, %dx -; AVX-NEXT: cmovol %r10d, %edx -; AVX-NEXT: vmovd %esi, %xmm0 -; AVX-NEXT: vpinsrw 
$1, %edx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0
-; AVX-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; AVX-NEXT: vpinsrw $4, %r14d, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $5, %r15d, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $6, %r12d, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $7, %r13d, %xmm1, %xmm1
-; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX-NEXT: vmovq %xmm0, 16(%rax)
-; AVX-NEXT: vmovdqa %xmm1, (%rax)
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %r12
-; AVX-NEXT: popq %r13
-; AVX-NEXT: popq %r14
-; AVX-NEXT: popq %r15
-; AVX-NEXT: popq %rbp
-; AVX-NEXT: retq
+; AVX512-LABEL: v12i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vpsubsw (%rsi), %ymm0, %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovq %xmm1, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%x = load <12 x i16>, <12 x i16>* %px
%y = load <12 x i16>, <12 x i16>* %py
%z = call <12 x i16> @llvm.ssub.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -16872,1960 +565,63 @@
; Promotion
define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
-; SSE2-LABEL: v16i4:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: psllw $4, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil
-; SSE2-NEXT: movl %r9d, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %r9b
-; SSE2-NEXT: jno .LBB15_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %r9d
-; SSE2-NEXT: .LBB15_2:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movl %esi, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %sil
-; SSE2-NEXT: jno .LBB15_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %esi
-; SSE2-NEXT: .LBB15_4:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl
-; SSE2-NEXT: movl %ebx, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %bl
-; SSE2-NEXT: jno .LBB15_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %ebx
-; SSE2-NEXT: .LBB15_6:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
-; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: subb %cl, %al
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: subb %cl, %dl
-; SSE2-NEXT: jno .LBB15_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: addb $127, %al
-; SSE2-NEXT: movl %eax, %edx
-; SSE2-NEXT: .LBB15_8:
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-;
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r10b -; SSE2-NEXT: jno .LBB15_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB15_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r11b -; SSE2-NEXT: jno .LBB15_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB15_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bpl -; SSE2-NEXT: jno .LBB15_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB15_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r14b -; SSE2-NEXT: jno .LBB15_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB15_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r15b -; SSE2-NEXT: jno .LBB15_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB15_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: jno .LBB15_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB15_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r13b -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB15_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB15_22: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dil -; SSE2-NEXT: jno .LBB15_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB15_24: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r8b -; SSE2-NEXT: jno .LBB15_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB15_26: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: jno .LBB15_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB15_28: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: 
subb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: subb %dl, %al -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB15_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB15_30: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %ecx -; SSE2-NEXT: subb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: subb %dl, %sil -; SSE2-NEXT: jno .LBB15_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %esi -; SSE2-NEXT: .LBB15_32: -; SSE2-NEXT: movzbl %sil, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r13b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movzbl %r12b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r15b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movzbl %r14b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %bpl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl %r11b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r10b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: psllw $4, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: psllw $4, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: jno .LBB15_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB15_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: jno .LBB15_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB15_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jno .LBB15_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB15_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB15_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB15_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r10b -; SSSE3-NEXT: jno .LBB15_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB15_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r11b -; SSSE3-NEXT: jno .LBB15_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB15_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bpl -; SSSE3-NEXT: jno .LBB15_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB15_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb 
-{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r14b -; SSSE3-NEXT: jno .LBB15_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB15_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r15b -; SSSE3-NEXT: jno .LBB15_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB15_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b -; SSSE3-NEXT: jno .LBB15_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB15_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r13b -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB15_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB15_22: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dil -; SSSE3-NEXT: jno .LBB15_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB15_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r8b -; SSSE3-NEXT: jno .LBB15_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB15_26: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jno .LBB15_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB15_28: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: subb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: subb %dl, %al -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB15_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB15_30: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %ecx -; SSSE3-NEXT: subb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: subb %dl, %sil -; SSSE3-NEXT: jno .LBB15_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: .LBB15_32: -; SSSE3-NEXT: movzbl %sil, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movzbl 
%bl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r8b, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r13b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: movzbl %r12b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r15b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movzbl %r14b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %bpl, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl %r11b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r10b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: psubb %xmm1, %xmm0 -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i4: -; SSE41: # %bb.0: -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pextrb $15, %xmm1, %ecx -; SSE41-NEXT: psllw $4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pextrb $15, 
%xmm0, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB15_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB15_2: -; SSE41-NEXT: pextrb $14, %xmm1, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r11b -; SSE41-NEXT: jno .LBB15_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB15_4: -; SSE41-NEXT: pextrb $13, %xmm1, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB15_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB15_6: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $12, %xmm1, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r14b -; SSE41-NEXT: jno .LBB15_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB15_8: -; SSE41-NEXT: pextrb $11, %xmm1, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB15_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB15_10: -; SSE41-NEXT: pextrb $10, %xmm1, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r15b -; SSE41-NEXT: jno .LBB15_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB15_12: -; SSE41-NEXT: pextrb $9, %xmm1, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r12b -; SSE41-NEXT: jno .LBB15_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB15_14: -; SSE41-NEXT: pextrb $8, %xmm1, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r13b -; SSE41-NEXT: jno .LBB15_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB15_16: -; SSE41-NEXT: pextrb $7, %xmm1, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r10b -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB15_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB15_18: -; SSE41-NEXT: pextrb $6, %xmm1, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r9b -; SSE41-NEXT: jno .LBB15_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB15_20: -; SSE41-NEXT: pextrb $5, %xmm1, %ecx -; SSE41-NEXT: 
pextrb $5, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB15_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB15_22: -; SSE41-NEXT: pextrb $4, %xmm1, %ecx -; SSE41-NEXT: pextrb $4, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB15_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB15_24: -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: setns %cl -; SSE41-NEXT: subb %dl, %al -; SSE41-NEXT: jno .LBB15_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: addb $127, %cl -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB15_26: -; SSE41-NEXT: pextrb $2, %xmm1, %ebx -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: subb %bl, %dl -; SSE41-NEXT: setns %dl -; SSE41-NEXT: subb %bl, %cl -; SSE41-NEXT: jno .LBB15_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: addb $127, %dl -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB15_28: -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %dl -; SSE41-NEXT: jno .LBB15_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: .LBB15_30: -; SSE41-NEXT: pextrb $1, %xmm1, %esi -; SSE41-NEXT: pextrb $1, %xmm0, %r8d -; SSE41-NEXT: movl %r8d, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %r8b -; SSE41-NEXT: jno .LBB15_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r8d -; SSE41-NEXT: .LBB15_32: -; SSE41-NEXT: movzbl %dl, %edx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: movzbl %r8b, %edx -; SSE41-NEXT: pinsrb $1, %edx, %xmm0 -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm0 -; SSE41-NEXT: movzbl %dil, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm0 -; SSE41-NEXT: movzbl %bpl, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm0 -; SSE41-NEXT: movzbl %r9b, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm0 -; SSE41-NEXT: movzbl %r10b, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %eax, %xmm0 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: movzbl %r11b, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm0 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: psubb %xmm1, %xmm0 -; 
SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; SSE-LABEL: v16i4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: psllw $4, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psubsb %xmm1, %xmm0 +; SSE-NEXT: psrlw $4, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: ; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $15, %xmm1, %ecx ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $15, %xmm0, %edx -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %dl -; AVX-NEXT: jno .LBB15_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: .LBB15_2: -; AVX-NEXT: vpextrb $14, %xmm1, %ecx -; AVX-NEXT: vpextrb $14, %xmm0, %r11d -; AVX-NEXT: movl %r11d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r11b -; AVX-NEXT: jno .LBB15_4 -; AVX-NEXT: # %bb.3: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r11d -; AVX-NEXT: .LBB15_4: -; AVX-NEXT: vpextrb $13, %xmm1, %ecx -; AVX-NEXT: vpextrb $13, %xmm0, %edi -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %dil -; AVX-NEXT: jno .LBB15_6 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: .LBB15_6: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r15 -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 -; AVX-NEXT: pushq %r12 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vpextrb $12, %xmm1, %ecx -; AVX-NEXT: vpextrb $12, %xmm0, %r14d -; AVX-NEXT: movl %r14d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r14b -; AVX-NEXT: jno .LBB15_8 -; AVX-NEXT: # %bb.7: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r14d -; AVX-NEXT: .LBB15_8: -; AVX-NEXT: vpextrb $11, %xmm1, %ecx -; AVX-NEXT: vpextrb $11, %xmm0, %ebp -; AVX-NEXT: movl %ebp, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %bpl -; AVX-NEXT: jno .LBB15_10 -; AVX-NEXT: # %bb.9: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: .LBB15_10: -; AVX-NEXT: vpextrb $10, %xmm1, %ecx -; AVX-NEXT: vpextrb $10, %xmm0, %r15d -; AVX-NEXT: movl %r15d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r15b -; AVX-NEXT: jno .LBB15_12 -; AVX-NEXT: # %bb.11: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r15d -; AVX-NEXT: .LBB15_12: -; AVX-NEXT: vpextrb $9, %xmm1, %ecx -; AVX-NEXT: vpextrb $9, %xmm0, %r12d -; AVX-NEXT: movl %r12d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r12b -; AVX-NEXT: jno .LBB15_14 -; AVX-NEXT: # %bb.13: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r12d -; AVX-NEXT: .LBB15_14: -; AVX-NEXT: vpextrb $8, %xmm1, %ecx -; AVX-NEXT: vpextrb $8, %xmm0, %r13d -; AVX-NEXT: movl %r13d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r13b -; AVX-NEXT: jno .LBB15_16 -; AVX-NEXT: # %bb.15: -; AVX-NEXT: 
addb $127, %al -; AVX-NEXT: movl %eax, %r13d -; AVX-NEXT: .LBB15_16: -; AVX-NEXT: vpextrb $7, %xmm1, %ecx -; AVX-NEXT: vpextrb $7, %xmm0, %r10d -; AVX-NEXT: movl %r10d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r10b -; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB15_18 -; AVX-NEXT: # %bb.17: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r10d -; AVX-NEXT: .LBB15_18: -; AVX-NEXT: vpextrb $6, %xmm1, %ecx -; AVX-NEXT: vpextrb $6, %xmm0, %r9d -; AVX-NEXT: movl %r9d, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %r9b -; AVX-NEXT: jno .LBB15_20 -; AVX-NEXT: # %bb.19: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %r9d -; AVX-NEXT: .LBB15_20: -; AVX-NEXT: vpextrb $5, %xmm1, %ecx -; AVX-NEXT: vpextrb $5, %xmm0, %ebp -; AVX-NEXT: movl %ebp, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %bpl -; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB15_22 -; AVX-NEXT: # %bb.21: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: .LBB15_22: -; AVX-NEXT: vpextrb $4, %xmm1, %ecx -; AVX-NEXT: vpextrb $4, %xmm0, %edi -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: setns %al -; AVX-NEXT: subb %cl, %dil -; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: jno .LBB15_24 -; AVX-NEXT: # %bb.23: -; AVX-NEXT: addb $127, %al -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: .LBB15_24: -; AVX-NEXT: vpextrb $3, %xmm1, %edx -; AVX-NEXT: vpextrb $3, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: subb %dl, %cl -; AVX-NEXT: setns %cl -; AVX-NEXT: subb %dl, %al -; AVX-NEXT: jno .LBB15_26 -; AVX-NEXT: # %bb.25: -; AVX-NEXT: addb $127, %cl -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: .LBB15_26: -; AVX-NEXT: vpextrb $2, %xmm1, %ebx -; AVX-NEXT: vpextrb $2, %xmm0, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: subb %bl, %dl -; AVX-NEXT: setns %dl -; AVX-NEXT: subb %bl, %cl -; AVX-NEXT: jno .LBB15_28 -; AVX-NEXT: # %bb.27: -; AVX-NEXT: addb $127, %dl -; AVX-NEXT: movl %edx, %ecx -; AVX-NEXT: .LBB15_28: -; AVX-NEXT: vpextrb $0, %xmm1, %esi -; AVX-NEXT: vpextrb $0, %xmm0, %edx -; AVX-NEXT: movl %edx, %ebx -; AVX-NEXT: subb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: subb %sil, %dl -; AVX-NEXT: jno .LBB15_30 -; AVX-NEXT: # %bb.29: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %edx -; AVX-NEXT: .LBB15_30: -; AVX-NEXT: vpextrb $1, %xmm1, %esi -; AVX-NEXT: vpextrb $1, %xmm0, %r8d -; AVX-NEXT: movl %r8d, %ebx -; AVX-NEXT: subb %sil, %bl -; AVX-NEXT: setns %bl -; AVX-NEXT: subb %sil, %r8b -; AVX-NEXT: jno .LBB15_32 -; AVX-NEXT: # %bb.31: -; AVX-NEXT: addb $127, %bl -; AVX-NEXT: movl %ebx, %r8d -; AVX-NEXT: .LBB15_32: -; AVX-NEXT: movzbl %dl, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: movzbl %r8b, %edx -; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %cl, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %dil, %eax -; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %bpl, %eax -; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r9b, %eax -; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r10b, %eax -; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r13b, %eax -; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r12b, %eax -; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r15b, %eax -; AVX-NEXT: vpinsrb 
$10, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r14b, %eax -; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl %r11b, %eax -; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r13 -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %r15 -; AVX-NEXT: popq %rbp ; AVX-NEXT: retq %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { -; SSE2-LABEL: v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: psllw $7, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %r9d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r9b -; SSE2-NEXT: jno .LBB16_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r9d -; SSE2-NEXT: .LBB16_2: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: jno .LBB16_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %esi -; SSE2-NEXT: .LBB16_4: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: jno .LBB16_6 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB16_6: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: jno .LBB16_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: .LBB16_8: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSE2-NEXT: movl %r10d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r10b -; SSE2-NEXT: jno .LBB16_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r10d -; SSE2-NEXT: .LBB16_10: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSE2-NEXT: movl %r11d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r11b -; SSE2-NEXT: jno .LBB16_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: addb $127, %al -; 
SSE2-NEXT: movl %eax, %r11d -; SSE2-NEXT: .LBB16_12: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSE2-NEXT: movl %ebp, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bpl -; SSE2-NEXT: jno .LBB16_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebp -; SSE2-NEXT: .LBB16_14: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r14b -; SSE2-NEXT: jno .LBB16_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r14d -; SSE2-NEXT: .LBB16_16: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r15b -; SSE2-NEXT: jno .LBB16_18 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r15d -; SSE2-NEXT: .LBB16_18: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r12b -; SSE2-NEXT: jno .LBB16_20 -; SSE2-NEXT: # %bb.19: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r12d -; SSE2-NEXT: .LBB16_20: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSE2-NEXT: movl %r13d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r13b -; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB16_22 -; SSE2-NEXT: # %bb.21: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r13d -; SSE2-NEXT: .LBB16_22: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %dil -; SSE2-NEXT: jno .LBB16_24 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %edi -; SSE2-NEXT: .LBB16_24: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSE2-NEXT: movl %r8d, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %r8b -; SSE2-NEXT: jno .LBB16_26 -; SSE2-NEXT: # %bb.25: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %r8d -; SSE2-NEXT: .LBB16_26: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSE2-NEXT: movl %ebx, %eax -; SSE2-NEXT: subb %cl, %al -; SSE2-NEXT: setns %al -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: jno .LBB16_28 -; SSE2-NEXT: # %bb.27: -; SSE2-NEXT: addb $127, %al -; SSE2-NEXT: movl %eax, %ebx -; SSE2-NEXT: .LBB16_28: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: subb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: subb %dl, %al -; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: jno .LBB16_30 -; SSE2-NEXT: # %bb.29: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: .LBB16_30: -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSE2-NEXT: movl %esi, %ecx -; SSE2-NEXT: subb %dl, %cl -; SSE2-NEXT: setns %cl -; SSE2-NEXT: subb %dl, %sil -; SSE2-NEXT: jno .LBB16_32 -; SSE2-NEXT: # %bb.31: -; SSE2-NEXT: addb $127, %cl -; SSE2-NEXT: movl %ecx, %esi 
-; SSE2-NEXT: .LBB16_32: -; SSE2-NEXT: movzbl %sil, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl %bl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl %r13b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl %r12b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movzbl %r15b, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movzbl %r14b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl %bpl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movzbl %r11b, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl %r10b, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl %r9b, %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm0 -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v16i1: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq 
%rbx -; SSSE3-NEXT: psllw $7, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: psllw $7, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %r9d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r9b -; SSSE3-NEXT: jno .LBB16_2 -; SSSE3-NEXT: # %bb.1: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r9d -; SSSE3-NEXT: .LBB16_2: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movl %esi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %sil -; SSSE3-NEXT: jno .LBB16_4 -; SSSE3-NEXT: # %bb.3: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %esi -; SSSE3-NEXT: .LBB16_4: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jno .LBB16_6 -; SSSE3-NEXT: # %bb.5: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB16_6: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movl %edx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dl -; SSSE3-NEXT: jno .LBB16_8 -; SSSE3-NEXT: # %bb.7: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edx -; SSSE3-NEXT: .LBB16_8: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b -; SSSE3-NEXT: movl %r10d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r10b -; SSSE3-NEXT: jno .LBB16_10 -; SSSE3-NEXT: # %bb.9: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r10d -; SSSE3-NEXT: .LBB16_10: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b -; SSSE3-NEXT: movl %r11d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r11b -; SSSE3-NEXT: jno .LBB16_12 -; SSSE3-NEXT: # %bb.11: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r11d -; SSSE3-NEXT: .LBB16_12: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl -; SSSE3-NEXT: movl %ebp, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bpl -; SSSE3-NEXT: jno .LBB16_14 -; SSSE3-NEXT: # %bb.13: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebp -; SSSE3-NEXT: .LBB16_14: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b -; SSSE3-NEXT: movl %r14d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r14b -; SSSE3-NEXT: jno .LBB16_16 -; SSSE3-NEXT: # %bb.15: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r14d -; SSSE3-NEXT: .LBB16_16: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b -; SSSE3-NEXT: movl %r15d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r15b -; SSSE3-NEXT: jno .LBB16_18 -; SSSE3-NEXT: # %bb.17: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r15d -; SSSE3-NEXT: .LBB16_18: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b -; SSSE3-NEXT: movl %r12d, %eax -; SSSE3-NEXT: 
subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r12b -; SSSE3-NEXT: jno .LBB16_20 -; SSSE3-NEXT: # %bb.19: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r12d -; SSSE3-NEXT: .LBB16_20: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b -; SSSE3-NEXT: movl %r13d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r13b -; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB16_22 -; SSSE3-NEXT: # %bb.21: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r13d -; SSSE3-NEXT: .LBB16_22: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil -; SSSE3-NEXT: movl %edi, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %dil -; SSSE3-NEXT: jno .LBB16_24 -; SSSE3-NEXT: # %bb.23: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %edi -; SSSE3-NEXT: .LBB16_24: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b -; SSSE3-NEXT: movl %r8d, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %r8b -; SSSE3-NEXT: jno .LBB16_26 -; SSSE3-NEXT: # %bb.25: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %r8d -; SSSE3-NEXT: .LBB16_26: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl -; SSSE3-NEXT: movl %ebx, %eax -; SSSE3-NEXT: subb %cl, %al -; SSSE3-NEXT: setns %al -; SSSE3-NEXT: subb %cl, %bl -; SSSE3-NEXT: jno .LBB16_28 -; SSSE3-NEXT: # %bb.27: -; SSSE3-NEXT: addb $127, %al -; SSSE3-NEXT: movl %eax, %ebx -; SSSE3-NEXT: .LBB16_28: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: subb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: subb %dl, %al -; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSSE3-NEXT: jno .LBB16_30 -; SSSE3-NEXT: # %bb.29: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %eax -; SSSE3-NEXT: .LBB16_30: -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil -; SSSE3-NEXT: movl %esi, %ecx -; SSSE3-NEXT: subb %dl, %cl -; SSSE3-NEXT: setns %cl -; SSSE3-NEXT: subb %dl, %sil -; SSSE3-NEXT: jno .LBB16_32 -; SSSE3-NEXT: # %bb.31: -; SSSE3-NEXT: addb $127, %cl -; SSSE3-NEXT: movl %ecx, %esi -; SSSE3-NEXT: .LBB16_32: -; SSSE3-NEXT: movzbl %sil, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movzbl %bl, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r8b, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl %r13b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movzbl %r12b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movzbl 
%r15b, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movzbl %r14b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl %bpl, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: movzbl %r11b, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl %r10b, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl %r9b, %eax -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pcmpgtb %xmm4, %xmm0 -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: psllw $7, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pextrb $15, %xmm1, %ecx -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pextrb $15, %xmm0, %edx -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dl -; SSE41-NEXT: jno .LBB16_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: .LBB16_2: -; SSE41-NEXT: pextrb $14, %xmm1, %ecx -; SSE41-NEXT: pextrb $14, %xmm0, %r11d -; SSE41-NEXT: movl %r11d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r11b -; SSE41-NEXT: jno .LBB16_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r11d -; SSE41-NEXT: .LBB16_4: -; SSE41-NEXT: pextrb $13, %xmm1, %ecx -; SSE41-NEXT: pextrb $13, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: jno .LBB16_6 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; 
SSE41-NEXT: .LBB16_6: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: pextrb $12, %xmm1, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %r14d -; SSE41-NEXT: movl %r14d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r14b -; SSE41-NEXT: jno .LBB16_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r14d -; SSE41-NEXT: .LBB16_8: -; SSE41-NEXT: pextrb $11, %xmm1, %ecx -; SSE41-NEXT: pextrb $11, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: jno .LBB16_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB16_10: -; SSE41-NEXT: pextrb $10, %xmm1, %ecx -; SSE41-NEXT: pextrb $10, %xmm0, %r15d -; SSE41-NEXT: movl %r15d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r15b -; SSE41-NEXT: jno .LBB16_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r15d -; SSE41-NEXT: .LBB16_12: -; SSE41-NEXT: pextrb $9, %xmm1, %ecx -; SSE41-NEXT: pextrb $9, %xmm0, %r12d -; SSE41-NEXT: movl %r12d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r12b -; SSE41-NEXT: jno .LBB16_14 -; SSE41-NEXT: # %bb.13: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r12d -; SSE41-NEXT: .LBB16_14: -; SSE41-NEXT: pextrb $8, %xmm1, %ecx -; SSE41-NEXT: pextrb $8, %xmm0, %r13d -; SSE41-NEXT: movl %r13d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r13b -; SSE41-NEXT: jno .LBB16_16 -; SSE41-NEXT: # %bb.15: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r13d -; SSE41-NEXT: .LBB16_16: -; SSE41-NEXT: pextrb $7, %xmm1, %ecx -; SSE41-NEXT: pextrb $7, %xmm0, %r10d -; SSE41-NEXT: movl %r10d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r10b -; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB16_18 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r10d -; SSE41-NEXT: .LBB16_18: -; SSE41-NEXT: pextrb $6, %xmm1, %ecx -; SSE41-NEXT: pextrb $6, %xmm0, %r9d -; SSE41-NEXT: movl %r9d, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %r9b -; SSE41-NEXT: jno .LBB16_20 -; SSE41-NEXT: # %bb.19: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %r9d -; SSE41-NEXT: .LBB16_20: -; SSE41-NEXT: pextrb $5, %xmm1, %ecx -; SSE41-NEXT: pextrb $5, %xmm0, %ebp -; SSE41-NEXT: movl %ebp, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %bpl -; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB16_22 -; SSE41-NEXT: # %bb.21: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %ebp -; SSE41-NEXT: .LBB16_22: -; SSE41-NEXT: pextrb $4, %xmm1, %ecx -; SSE41-NEXT: pextrb $4, %xmm0, %edi -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: subb %cl, %al -; SSE41-NEXT: setns %al -; SSE41-NEXT: subb %cl, %dil -; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE41-NEXT: jno .LBB16_24 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: addb $127, %al -; SSE41-NEXT: movl %eax, %edi -; SSE41-NEXT: .LBB16_24: -; SSE41-NEXT: pextrb $3, %xmm1, %edx -; SSE41-NEXT: pextrb $3, %xmm0, %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: subb %dl, %cl -; SSE41-NEXT: setns %cl -; 
SSE41-NEXT: subb %dl, %al -; SSE41-NEXT: jno .LBB16_26 -; SSE41-NEXT: # %bb.25: -; SSE41-NEXT: addb $127, %cl -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: .LBB16_26: -; SSE41-NEXT: pextrb $2, %xmm1, %ebx -; SSE41-NEXT: pextrb $2, %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %edx -; SSE41-NEXT: subb %bl, %dl -; SSE41-NEXT: setns %dl -; SSE41-NEXT: subb %bl, %cl -; SSE41-NEXT: jno .LBB16_28 -; SSE41-NEXT: # %bb.27: -; SSE41-NEXT: addb $127, %dl -; SSE41-NEXT: movl %edx, %ecx -; SSE41-NEXT: .LBB16_28: -; SSE41-NEXT: pextrb $0, %xmm1, %esi -; SSE41-NEXT: pextrb $0, %xmm0, %edx -; SSE41-NEXT: movl %edx, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %dl -; SSE41-NEXT: jno .LBB16_30 -; SSE41-NEXT: # %bb.29: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %edx -; SSE41-NEXT: .LBB16_30: -; SSE41-NEXT: pextrb $1, %xmm1, %esi -; SSE41-NEXT: pextrb $1, %xmm0, %r8d -; SSE41-NEXT: movl %r8d, %ebx -; SSE41-NEXT: subb %sil, %bl -; SSE41-NEXT: setns %bl -; SSE41-NEXT: subb %sil, %r8b -; SSE41-NEXT: jno .LBB16_32 -; SSE41-NEXT: # %bb.31: -; SSE41-NEXT: addb $127, %bl -; SSE41-NEXT: movl %ebx, %r8d -; SSE41-NEXT: .LBB16_32: -; SSE41-NEXT: movzbl %dl, %edx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: movzbl %r8b, %edx -; SSE41-NEXT: pinsrb $1, %edx, %xmm1 -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pinsrb $3, %eax, %xmm1 -; SSE41-NEXT: movzbl %dil, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm1 -; SSE41-NEXT: movzbl %bpl, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm1 -; SSE41-NEXT: movzbl %r9b, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm1 -; SSE41-NEXT: movzbl %r10b, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm1 -; SSE41-NEXT: movzbl %r13b, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm1 -; SSE41-NEXT: movzbl %r12b, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm1 -; SSE41-NEXT: movzbl %r15b, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $11, %eax, %xmm1 -; SSE41-NEXT: movzbl %r14b, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $13, %eax, %xmm1 -; SSE41-NEXT: movzbl %r11b, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm1 -; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp -; SSE41-NEXT: retq +; SSE-LABEL: v16i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psubsb %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %ecx ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $15, %xmm0, %edx -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: setns %al -; AVX1-NEXT: subb 
%cl, %dl
-; AVX1-NEXT: jno .LBB16_2
-; AVX1-NEXT: # %bb.1:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: .LBB16_2:
-; AVX1-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $14, %xmm0, %r11d
-; AVX1-NEXT: movl %r11d, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %r11b
-; AVX1-NEXT: jno .LBB16_4
-; AVX1-NEXT: # %bb.3:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %r11d
-; AVX1-NEXT: .LBB16_4:
-; AVX1-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $13, %xmm0, %edi
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %dil
-; AVX1-NEXT: jno .LBB16_6
-; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %edi
-; AVX1-NEXT: .LBB16_6:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $12, %xmm0, %r14d
-; AVX1-NEXT: movl %r14d, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %r14b
-; AVX1-NEXT: jno .LBB16_8
-; AVX1-NEXT: # %bb.7:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %r14d
-; AVX1-NEXT: .LBB16_8:
-; AVX1-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $11, %xmm0, %ebp
-; AVX1-NEXT: movl %ebp, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %bpl
-; AVX1-NEXT: jno .LBB16_10
-; AVX1-NEXT: # %bb.9:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %ebp
-; AVX1-NEXT: .LBB16_10:
-; AVX1-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $10, %xmm0, %r15d
-; AVX1-NEXT: movl %r15d, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %r15b
-; AVX1-NEXT: jno .LBB16_12
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %r15d
-; AVX1-NEXT: .LBB16_12:
-; AVX1-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $9, %xmm0, %r12d
-; AVX1-NEXT: movl %r12d, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %r12b
-; AVX1-NEXT: jno .LBB16_14
-; AVX1-NEXT: # %bb.13:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %r12d
-; AVX1-NEXT: .LBB16_14:
-; AVX1-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $8, %xmm0, %r13d
-; AVX1-NEXT: movl %r13d, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %r13b
-; AVX1-NEXT: jno .LBB16_16
-; AVX1-NEXT: # %bb.15:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %r13d
-; AVX1-NEXT: .LBB16_16:
-; AVX1-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $7, %xmm0, %r10d
-; AVX1-NEXT: movl %r10d, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %r10b
-; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jno .LBB16_18
-; AVX1-NEXT: # %bb.17:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %r10d
-; AVX1-NEXT: .LBB16_18:
-; AVX1-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $6, %xmm0, %r9d
-; AVX1-NEXT: movl %r9d, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %r9b
-; AVX1-NEXT: jno .LBB16_20
-; AVX1-NEXT: # %bb.19:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %r9d
-; AVX1-NEXT: .LBB16_20:
-; AVX1-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $5, %xmm0, %ebp
-; AVX1-NEXT: movl %ebp, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %bpl
-; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jno .LBB16_22
-; AVX1-NEXT: # %bb.21:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %ebp
-; AVX1-NEXT: .LBB16_22:
-; AVX1-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX1-NEXT: vpextrb $4, %xmm0, %edi
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: subb %cl, %al
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: subb %cl, %dil
-; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: jno .LBB16_24
-; AVX1-NEXT: # %bb.23:
-; AVX1-NEXT: addb $127, %al
-; AVX1-NEXT: movl %eax, %edi
-; AVX1-NEXT: .LBB16_24:
-; AVX1-NEXT: vpextrb $3, %xmm1, %edx
-; AVX1-NEXT: vpextrb $3, %xmm0, %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: subb %dl, %cl
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: subb %dl, %al
-; AVX1-NEXT: jno .LBB16_26
-; AVX1-NEXT: # %bb.25:
-; AVX1-NEXT: addb $127, %cl
-; AVX1-NEXT: movl %ecx, %eax
-; AVX1-NEXT: .LBB16_26:
-; AVX1-NEXT: vpextrb $2, %xmm1, %ebx
-; AVX1-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT: movl %ecx, %edx
-; AVX1-NEXT: subb %bl, %dl
-; AVX1-NEXT: setns %dl
-; AVX1-NEXT: subb %bl, %cl
-; AVX1-NEXT: jno .LBB16_28
-; AVX1-NEXT: # %bb.27:
-; AVX1-NEXT: addb $127, %dl
-; AVX1-NEXT: movl %edx, %ecx
-; AVX1-NEXT: .LBB16_28:
-; AVX1-NEXT: vpextrb $0, %xmm1, %esi
-; AVX1-NEXT: vpextrb $0, %xmm0, %edx
-; AVX1-NEXT: movl %edx, %ebx
-; AVX1-NEXT: subb %sil, %bl
-; AVX1-NEXT: setns %bl
-; AVX1-NEXT: subb %sil, %dl
-; AVX1-NEXT: jno .LBB16_30
-; AVX1-NEXT: # %bb.29:
-; AVX1-NEXT: addb $127, %bl
-; AVX1-NEXT: movl %ebx, %edx
-; AVX1-NEXT: .LBB16_30:
-; AVX1-NEXT: vpextrb $1, %xmm1, %esi
-; AVX1-NEXT: vpextrb $1, %xmm0, %r8d
-; AVX1-NEXT: movl %r8d, %ebx
-; AVX1-NEXT: subb %sil, %bl
-; AVX1-NEXT: setns %bl
-; AVX1-NEXT: subb %sil, %r8b
-; AVX1-NEXT: jno .LBB16_32
-; AVX1-NEXT: # %bb.31:
-; AVX1-NEXT: addb $127, %bl
-; AVX1-NEXT: movl %ebx, %r8d
-; AVX1-NEXT: .LBB16_32:
-; AVX1-NEXT: movzbl %dl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: movzbl %r8b, %edx
-; AVX1-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %dil, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %bpl, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r9b, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r10b, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r13b, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r12b, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r15b, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r14b, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl %r11b, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: v16i1:
@@ -18833,604 +629,21 @@
 ; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
 ; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0
 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $15, %xmm0, %edx
-; AVX2-NEXT: movl %edx, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %dl
-; AVX2-NEXT: jno .LBB16_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %edx
-; AVX2-NEXT: .LBB16_2:
-; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, %r11d
-; AVX2-NEXT: movl %r11d, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %r11b
-; AVX2-NEXT: jno .LBB16_4
-; AVX2-NEXT: # %bb.3:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %r11d
-; AVX2-NEXT: .LBB16_4:
-; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $13, %xmm0, %edi
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %dil
-; AVX2-NEXT: jno .LBB16_6
-; AVX2-NEXT: # %bb.5:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %edi
-; AVX2-NEXT: .LBB16_6:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, %r14d
-; AVX2-NEXT: movl %r14d, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %r14b
-; AVX2-NEXT: jno .LBB16_8
-; AVX2-NEXT: # %bb.7:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %r14d
-; AVX2-NEXT: .LBB16_8:
-; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $11, %xmm0, %ebp
-; AVX2-NEXT: movl %ebp, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %bpl
-; AVX2-NEXT: jno .LBB16_10
-; AVX2-NEXT: # %bb.9:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %ebp
-; AVX2-NEXT: .LBB16_10:
-; AVX2-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, %r15d
-; AVX2-NEXT: movl %r15d, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %r15b
-; AVX2-NEXT: jno .LBB16_12
-; AVX2-NEXT: # %bb.11:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %r15d
-; AVX2-NEXT: .LBB16_12:
-; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $9, %xmm0, %r12d
-; AVX2-NEXT: movl %r12d, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %r12b
-; AVX2-NEXT: jno .LBB16_14
-; AVX2-NEXT: # %bb.13:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %r12d
-; AVX2-NEXT: .LBB16_14:
-; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, %r13d
-; AVX2-NEXT: movl %r13d, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %r13b
-; AVX2-NEXT: jno .LBB16_16
-; AVX2-NEXT: # %bb.15:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %r13d
-; AVX2-NEXT: .LBB16_16:
-; AVX2-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $7, %xmm0, %r10d
-; AVX2-NEXT: movl %r10d, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %r10b
-; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: jno .LBB16_18
-; AVX2-NEXT: # %bb.17:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %r10d
-; AVX2-NEXT: .LBB16_18:
-; AVX2-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm0, %r9d
-; AVX2-NEXT: movl %r9d, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %r9b
-; AVX2-NEXT: jno .LBB16_20
-; AVX2-NEXT: # %bb.19:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %r9d
-; AVX2-NEXT: .LBB16_20:
-; AVX2-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $5, %xmm0, %ebp
-; AVX2-NEXT: movl %ebp, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %bpl
-; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: jno .LBB16_22
-; AVX2-NEXT: # %bb.21:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %ebp
-; AVX2-NEXT: .LBB16_22:
-; AVX2-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT: vpextrb $4, %xmm0, %edi
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: subb %cl, %al
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: subb %cl, %dil
-; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: jno .LBB16_24
-; AVX2-NEXT: # %bb.23:
-; AVX2-NEXT: addb $127, %al
-; AVX2-NEXT: movl %eax, %edi
-; AVX2-NEXT: .LBB16_24:
-; AVX2-NEXT: vpextrb $3, %xmm1, %edx
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: subb %dl, %cl
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: subb %dl, %al
-; AVX2-NEXT: jno .LBB16_26
-; AVX2-NEXT: # %bb.25:
-; AVX2-NEXT: addb $127, %cl
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: .LBB16_26:
-; AVX2-NEXT: vpextrb $2, %xmm1, %ebx
-; AVX2-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT: movl %ecx, %edx
-; AVX2-NEXT: subb %bl, %dl
-; AVX2-NEXT: setns %dl
-; AVX2-NEXT: subb %bl, %cl
-; AVX2-NEXT: jno .LBB16_28
-; AVX2-NEXT: # %bb.27:
-; AVX2-NEXT: addb $127, %dl
-; AVX2-NEXT: movl %edx, %ecx
-; AVX2-NEXT: .LBB16_28:
-; AVX2-NEXT: vpextrb $0, %xmm1, %esi
-; AVX2-NEXT: vpextrb $0, %xmm0, %edx
-; AVX2-NEXT: movl %edx, %ebx
-; AVX2-NEXT: subb %sil, %bl
-; AVX2-NEXT: setns %bl
-; AVX2-NEXT: subb %sil, %dl
-; AVX2-NEXT: jno .LBB16_30
-; AVX2-NEXT: # %bb.29:
-; AVX2-NEXT: addb $127, %bl
-; AVX2-NEXT: movl %ebx, %edx
-; AVX2-NEXT: .LBB16_30:
-; AVX2-NEXT: vpextrb $1, %xmm1, %esi
-; AVX2-NEXT: vpextrb $1, %xmm0, %r8d
-; AVX2-NEXT: movl %r8d, %ebx
-; AVX2-NEXT: subb %sil, %bl
-; AVX2-NEXT: setns %bl
-; AVX2-NEXT: subb %sil, %r8b
-; AVX2-NEXT: jno .LBB16_32
-; AVX2-NEXT: # %bb.31:
-; AVX2-NEXT: addb $127, %bl
-; AVX2-NEXT: movl %ebx, %r8d
-; AVX2-NEXT: .LBB16_32:
-; AVX2-NEXT: movzbl %dl, %edx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: movzbl %r8b, %edx
-; AVX2-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %dil, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %bpl, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r9b, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r10b, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r13b, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r12b, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r15b, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r14b, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl %r11b, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: v16i1:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0
 ; AVX512-NEXT: vpmovb2m %xmm0, %k0
-; AVX512-NEXT: kshiftrw $1, %k0, %k1
-; AVX512-NEXT: kmovd %k1, %edx
 ; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0
 ; AVX512-NEXT: vpmovb2m %xmm0, %k1
-; AVX512-NEXT: kshiftrw $1, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %dl
-; AVX512-NEXT: movl %edx, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %dl
-; AVX512-NEXT: kmovd %k0, %esi
-; AVX512-NEXT: kmovd %k1, %eax
-; AVX512-NEXT: jno .LBB16_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %edx
-; AVX512-NEXT: .LBB16_2:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %sil
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %sil
-; AVX512-NEXT: kshiftrw $2, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %edi
-; AVX512-NEXT: kshiftrw $2, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: jno .LBB16_4
-; AVX512-NEXT: # %bb.3:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %esi
-; AVX512-NEXT: .LBB16_4:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %dil
-; AVX512-NEXT: movl %edi, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %dil
-; AVX512-NEXT: kshiftrw $3, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %r11d
-; AVX512-NEXT: kshiftrw $3, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: jno .LBB16_6
-; AVX512-NEXT: # %bb.5:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %edi
-; AVX512-NEXT: .LBB16_6:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %r11b
-; AVX512-NEXT: movl %r11d, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %r11b
-; AVX512-NEXT: kshiftrw $4, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %r14d
-; AVX512-NEXT: kshiftrw $4, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: jno .LBB16_8
-; AVX512-NEXT: # %bb.7:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %r11d
-; AVX512-NEXT: .LBB16_8:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %r14b
-; AVX512-NEXT: movl %r14d, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %r14b
-; AVX512-NEXT: kshiftrw $5, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %r15d
-; AVX512-NEXT: kshiftrw $5, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: jno .LBB16_10
-; AVX512-NEXT: # %bb.9:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %r14d
-; AVX512-NEXT: .LBB16_10:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %r15b
-; AVX512-NEXT: movl %r15d, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %r15b
-; AVX512-NEXT: kshiftrw $6, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %r12d
-; AVX512-NEXT: kshiftrw $6, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: jno .LBB16_12
-; AVX512-NEXT: # %bb.11:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %r15d
-; AVX512-NEXT: .LBB16_12:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %r12b
-; AVX512-NEXT: movl %r12d, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %r12b
-; AVX512-NEXT: kshiftrw $7, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %r13d
-; AVX512-NEXT: kshiftrw $7, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: jno .LBB16_14
-; AVX512-NEXT: # %bb.13:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %r12d
-; AVX512-NEXT: .LBB16_14:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %r13b
-; AVX512-NEXT: movl %r13d, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %r13b
-; AVX512-NEXT: kshiftrw $8, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %r9d
-; AVX512-NEXT: kshiftrw $8, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: jno .LBB16_16
-; AVX512-NEXT: # %bb.15:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %r13d
-; AVX512-NEXT: .LBB16_16:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %r9b
-; AVX512-NEXT: movl %r9d, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %r9b
-; AVX512-NEXT: kshiftrw $9, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %r10d
-; AVX512-NEXT: kshiftrw $9, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: jno .LBB16_18
-; AVX512-NEXT: # %bb.17:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: .LBB16_18:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %r10b
-; AVX512-NEXT: movl %r10d, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %r10b
-; AVX512-NEXT: kshiftrw $10, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %ebp
-; AVX512-NEXT: kshiftrw $10, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: jno .LBB16_20
-; AVX512-NEXT: # %bb.19:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %r10d
-; AVX512-NEXT: .LBB16_20:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %bpl
-; AVX512-NEXT: movl %ebp, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: subb %al, %bpl
-; AVX512-NEXT: kshiftrw $11, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %edi
-; AVX512-NEXT: kshiftrw $11, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: jno .LBB16_22
-; AVX512-NEXT: # %bb.21:
-; AVX512-NEXT: addb $127, %cl
-; AVX512-NEXT: movl %ecx, %ebp
-; AVX512-NEXT: .LBB16_22:
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: shlb $7, %dil
-; AVX512-NEXT: movl %edi, %ecx
-; AVX512-NEXT: subb %al, %cl
-; AVX512-NEXT: setns %dl
-; AVX512-NEXT: subb %al, %dil
-; AVX512-NEXT: kshiftrw $12, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: kshiftrw $12, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %ecx
-; AVX512-NEXT: jno .LBB16_24
-; AVX512-NEXT: # %bb.23:
-; AVX512-NEXT: addb $127, %dl
-; AVX512-NEXT: movl %edx, %edi
-; AVX512-NEXT: .LBB16_24:
-; AVX512-NEXT: shlb $7, %cl
-; AVX512-NEXT: shlb $7, %al
-; AVX512-NEXT: movl %eax, %edx
-; AVX512-NEXT: subb %cl, %dl
-; AVX512-NEXT: setns %bl
-; AVX512-NEXT: subb %cl, %al
-; AVX512-NEXT: kshiftrw $13, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %ecx
-; AVX512-NEXT: kshiftrw $13, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %edx
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: jno .LBB16_26
-; AVX512-NEXT: # %bb.25:
-; AVX512-NEXT: addb $127, %bl
-; AVX512-NEXT: movl %ebx, %eax
-; AVX512-NEXT: .LBB16_26:
-; AVX512-NEXT: shlb $7, %dl
-; AVX512-NEXT: shlb $7, %cl
-; AVX512-NEXT: movl %ecx, %ebx
-; AVX512-NEXT: subb %dl, %bl
-; AVX512-NEXT: setns %bl
-; AVX512-NEXT: subb %dl, %cl
-; AVX512-NEXT: kshiftrw $14, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %edx
-; AVX512-NEXT: kshiftrw $14, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %esi
-; AVX512-NEXT: jno .LBB16_28
-; AVX512-NEXT: # %bb.27:
-; AVX512-NEXT: addb $127, %bl
-; AVX512-NEXT: movl %ebx, %ecx
-; AVX512-NEXT: .LBB16_28:
-; AVX512-NEXT: shlb $7, %sil
-; AVX512-NEXT: shlb $7, %dl
-; AVX512-NEXT: movl %edx, %ebx
-; AVX512-NEXT: subb %sil, %bl
-; AVX512-NEXT: setns %bl
-; AVX512-NEXT: subb %sil, %dl
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
-; AVX512-NEXT: kmovd %k0, %r8d
-; AVX512-NEXT: kshiftrw $15, %k1, %k0
-; AVX512-NEXT: kmovd %k0, %esi
-; AVX512-NEXT: jno .LBB16_30
-; AVX512-NEXT: # %bb.29:
-; AVX512-NEXT: addb $127, %bl
-; AVX512-NEXT: movl %ebx, %edx
-; AVX512-NEXT: .LBB16_30:
-; AVX512-NEXT: shlb $7, %sil
-; AVX512-NEXT: shlb $7, %r8b
-; AVX512-NEXT: movl %r8d, %ebx
-; AVX512-NEXT: subb %sil, %bl
-; AVX512-NEXT: setns %bl
-; AVX512-NEXT: subb %sil, %r8b
-; AVX512-NEXT: jno .LBB16_32
-; AVX512-NEXT: # %bb.31:
-; AVX512-NEXT: addb $127, %bl
-; AVX512-NEXT: movl %ebx, %r8d
-; AVX512-NEXT: .LBB16_32:
-; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; AVX512-NEXT: sarb $7, %sil
-; AVX512-NEXT: kmovd %esi, %k1
-; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; AVX512-NEXT: sarb $7, %sil
-; AVX512-NEXT: kmovd %esi, %k0
-; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; AVX512-NEXT: sarb $7, %sil
-; AVX512-NEXT: kmovd %esi, %k2
-; AVX512-NEXT: sarb $7, %r11b
-; AVX512-NEXT: kmovd %r11d, %k3
-; AVX512-NEXT: sarb $7, %r14b
-; AVX512-NEXT: kmovd %r14d, %k4
-; AVX512-NEXT: sarb $7, %r15b
-; AVX512-NEXT: kmovd %r15d, %k5
-; AVX512-NEXT: sarb $7, %r12b
-; AVX512-NEXT: kmovd %r12d, %k6
-; AVX512-NEXT: kshiftrw $1, %k0, %k7
-; AVX512-NEXT: kxorw %k1, %k7, %k7
-; AVX512-NEXT: sarb $7, %r13b
-; AVX512-NEXT: kmovd %r13d, %k1
-; AVX512-NEXT: kshiftlw $15, %k7, %k7
-; AVX512-NEXT: kshiftrw $14, %k7, %k7
-; AVX512-NEXT: kxorw %k7, %k0, %k0
-; AVX512-NEXT: kshiftrw $2, %k0, %k7
-; AVX512-NEXT: kxorw %k2, %k7, %k7
-; AVX512-NEXT: sarb $7, %r9b
-; AVX512-NEXT: kmovd %r9d, %k2
-; AVX512-NEXT: kshiftlw $15, %k7, %k7
-; AVX512-NEXT: kshiftrw $13, %k7, %k7
-; AVX512-NEXT: kxorw %k7, %k0, %k0
-; AVX512-NEXT: kshiftrw $3, %k0, %k7
-; AVX512-NEXT: kxorw %k3, %k7, %k7
-; AVX512-NEXT: sarb $7, %r10b
-; AVX512-NEXT: kmovd %r10d, %k3
-; AVX512-NEXT: kshiftlw $15, %k7, %k7
-; AVX512-NEXT: kshiftrw $12, %k7, %k7
-; AVX512-NEXT: kxorw %k7, %k0, %k7
-; AVX512-NEXT: kshiftrw $4, %k7, %k0
-; AVX512-NEXT: kxorw %k4, %k0, %k4
-; AVX512-NEXT: sarb $7, %bpl
-; AVX512-NEXT: kmovd %ebp, %k0
-; AVX512-NEXT: kshiftlw $15, %k4, %k4
-; AVX512-NEXT: kshiftrw $11, %k4, %k4
-; AVX512-NEXT: kxorw %k4, %k7, %k7
-; AVX512-NEXT: kshiftrw $5, %k7, %k4
-; AVX512-NEXT: kxorw %k5, %k4, %k5
-; AVX512-NEXT: sarb $7, %dil
-; AVX512-NEXT: kmovd %edi, %k4
-; AVX512-NEXT: kshiftlw $15, %k5, %k5
-; AVX512-NEXT: kshiftrw $10, %k5, %k5
-; AVX512-NEXT: kxorw %k5, %k7, %k7
-; AVX512-NEXT: kshiftrw $6, %k7, %k5
-; AVX512-NEXT: kxorw %k6, %k5, %k6
-; AVX512-NEXT: sarb $7, %al
-; AVX512-NEXT: kmovd %eax, %k5
-; AVX512-NEXT: kshiftlw $15, %k6, %k6
-; AVX512-NEXT: kshiftrw $9, %k6, %k6
-; AVX512-NEXT: kxorw %k6, %k7, %k6
-; AVX512-NEXT: kshiftrw $7, %k6, %k7
-; AVX512-NEXT: kxorw %k1, %k7, %k7
-; AVX512-NEXT: sarb $7, %cl
-; AVX512-NEXT: kmovd %ecx, %k1
-; AVX512-NEXT: kshiftlw $15, %k7, %k7
-; AVX512-NEXT: kshiftrw $8, %k7, %k7
-; AVX512-NEXT: kxorw %k7, %k6, %k6
-; AVX512-NEXT: kshiftrw $8, %k6, %k7
-; AVX512-NEXT: kxorw %k2, %k7, %k7
-; AVX512-NEXT: sarb $7, %dl
-; AVX512-NEXT: kmovd %edx, %k2
-; AVX512-NEXT: kshiftlw $15, %k7, %k7
-; AVX512-NEXT: kshiftrw $7, %k7, %k7
-; AVX512-NEXT: kxorw %k7, %k6, %k6
-; AVX512-NEXT: kshiftrw $9, %k6, %k7
-; AVX512-NEXT: kxorw %k3, %k7, %k3
-; AVX512-NEXT: sarb $7, %r8b
-; AVX512-NEXT: kmovd %r8d, %k7
-; AVX512-NEXT: kshiftlw $15, %k3, %k3
-; AVX512-NEXT: kshiftrw $6, %k3, %k3
-; AVX512-NEXT: kxorw %k3, %k6, %k3
-; AVX512-NEXT: kshiftrw $10, %k3, %k6
-; AVX512-NEXT: kxorw %k0, %k6, %k0
-; AVX512-NEXT: kshiftlw $15, %k0, %k0
-; AVX512-NEXT: kshiftrw $5, %k0, %k0
-; AVX512-NEXT: kxorw %k0, %k3, %k0
-; AVX512-NEXT: kshiftrw $11, %k0, %k3
-; AVX512-NEXT: kxorw %k4, %k3, %k3
-; AVX512-NEXT: kshiftlw $15, %k3, %k3
-; AVX512-NEXT: kshiftrw $4, %k3, %k3
-; AVX512-NEXT: kxorw %k3, %k0, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k3
-; AVX512-NEXT: kxorw %k5, %k3, %k3
-; AVX512-NEXT: kshiftlw $15, %k3, %k3
-; AVX512-NEXT: kshiftrw $3, %k3, %k3
-; AVX512-NEXT: kxorw %k3, %k0, %k0
-; AVX512-NEXT: kshiftrw $13, %k0, %k3
-; AVX512-NEXT: kxorw %k1, %k3, %k1
-; AVX512-NEXT: kshiftlw $15, %k1, %k1
-; AVX512-NEXT: kshiftrw $2, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
-; AVX512-NEXT: kshiftrw $14, %k0, %k1
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kshiftlw $15, %k1, %k1
-; AVX512-NEXT: kshiftrw $1, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
-; AVX512-NEXT: kshiftlw $1, %k0, %k0
-; AVX512-NEXT: kshiftrw $1, %k0, %k0
-; AVX512-NEXT: kshiftlw $15, %k7, %k1
-; AVX512-NEXT: korw %k1, %k0, %k0
+; AVX512-NEXT: kandnw %k0, %k1, %k0
 ; AVX512-NEXT: vpmovm2b %k0, %xmm0
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
 ; AVX512-NEXT: retq
   %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
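The v16i1 variant above is the degenerate case: an i1 signed value is 0 or -1, so ssub.sat(x, y) only produces -1 when x = -1 and y = 0, i.e. it reduces to the bitwise x & ~y, which is why the AVX512 body collapses to a single kandnw. For the ordinary byte-vector case, a minimal standalone reproducer along the following lines (the function name is illustrative, not one of the tests touched by this patch) should now select a single psubsb/vpsubsb under SSE2/AVX instead of the scalarized branch-per-element sequence deleted above:

declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>)

define <16 x i8> @ssub_sat_v16i8(<16 x i8> %x, <16 x i8> %y) {
  ; Signed saturating subtract: each lane is clamped to [-128, 127].
  %z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
  ret <16 x i8> %z
}

Feeding this through llc with an x86-64 triple and, say, -mattr=+avx should show the vpsubsb form of the lowering that the updated CHECK lines in this hunk expect.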