diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17648,6 +17648,40 @@
                      DAG.getIntPtrConstant(0, dl));
 }
 
+// Elements of N read via constant-index extracts; all-ones for any other use.
+static APInt getExtractedDemandedElts(SDNode *N) {
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  APInt DemandedElts = APInt::getZero(NumElts);
+  for (SDNode *User : N->uses()) {
+    switch (User->getOpcode()) {
+    case X86ISD::PEXTRB:
+    case X86ISD::PEXTRW:
+    case ISD::EXTRACT_VECTOR_ELT:
+      if (!isa<ConstantSDNode>(User->getOperand(1))) {
+        DemandedElts.setAllBits();
+        return DemandedElts;
+      }
+      DemandedElts.setBit(User->getConstantOperandVal(1));
+      break;
+    case ISD::BITCAST: {
+      if (!User->getValueType(0).isSimple() ||
+          !User->getValueType(0).isVector()) {
+        DemandedElts.setAllBits();
+        return DemandedElts;
+      }
+      APInt DemandedSrcElts = getExtractedDemandedElts(User);
+      DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
+      break;
+    }
+    default:
+      DemandedElts.setAllBits();
+      return DemandedElts;
+    }
+  }
+  return DemandedElts;
+}
+
 SDValue
 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                            SelectionDAG &DAG) const {
@@ -17739,13 +17773,16 @@
     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
       return Res;
 
-  // TODO: We only extract a single element from v16i8, we can probably afford
-  // to be more aggressive here before using the default approach of spilling to
-  // stack.
-  if (VT == MVT::i8 && Op->isOnlyUserOf(Vec.getNode())) {
+  // Only extract a single element from a v16i8 source - determine the common
+  // DWORD/WORD that all extractions share, and extract the sub-byte.
+  // TODO: Add QWORD MOVQ extraction?
+ if (VT == MVT::i8) { + APInt DemandedElts = getExtractedDemandedElts(Vec.getNode()); + assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch"); + // Extract either the lowest i32 or any i16, and extract the sub-byte. int DWordIdx = IdxVal / 4; - if (DWordIdx == 0) { + if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), DAG.getIntPtrConstant(DWordIdx, dl)); @@ -17757,14 +17794,16 @@ } int WordIdx = IdxVal / 2; - SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, - DAG.getBitcast(MVT::v8i16, Vec), - DAG.getIntPtrConstant(WordIdx, dl)); - int ShiftVal = (IdxVal % 2) * 8; - if (ShiftVal != 0) - Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, - DAG.getConstant(ShiftVal, dl, MVT::i8)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) { + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, + DAG.getBitcast(MVT::v8i16, Vec), + DAG.getIntPtrConstant(WordIdx, dl)); + int ShiftVal = (IdxVal % 2) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, + DAG.getConstant(ShiftVal, dl, MVT::i8)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } } if (VT == MVT::f16 || VT.getSizeInBits() == 32) { diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -214,23 +214,14 @@ } define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { -; SSE2-SSSE3-LABEL: bitcast_v16i8_to_v2i8: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: movd %eax, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: bitcast_v16i8_to_v2i8: -; SSE41: # 
%bb.0: -; SSE41-NEXT: pmovmskb %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: shrl $8, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: bitcast_v16i8_to_v2i8: +; SSE: # %bb.0: +; SSE-NEXT: pmovmskb %xmm0, %ecx +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq ; ; AVX12-LABEL: bitcast_v16i8_to_v2i8: ; AVX12: # %bb.0: @@ -447,25 +438,15 @@ } define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: bitcast_v16i16_to_v2i8: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: movd %eax, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: bitcast_v16i16_to_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: packsswb %xmm1, %xmm0 -; SSE41-NEXT: pmovmskb %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: shrl $8, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: bitcast_v16i16_to_v2i8: +; SSE: # %bb.0: +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %ecx +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq ; ; AVX1-LABEL: bitcast_v16i16_to_v2i8: ; AVX1: # %bb.0: @@ -776,29 +757,17 @@ } define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { -; SSE2-SSSE3-LABEL: bitcast_v16i32_to_v2i8: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: movd %eax, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; 
SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: bitcast_v16i32_to_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: packssdw %xmm3, %xmm2 -; SSE41-NEXT: packssdw %xmm1, %xmm0 -; SSE41-NEXT: packsswb %xmm2, %xmm0 -; SSE41-NEXT: pmovmskb %xmm0, %ecx -; SSE41-NEXT: movl %ecx, %eax -; SSE41-NEXT: shrl $8, %eax -; SSE41-NEXT: addb %cl, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: bitcast_v16i32_to_v2i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %ecx +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq ; ; AVX1-LABEL: bitcast_v16i32_to_v2i8: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll --- a/llvm/test/CodeGen/X86/pr63108.ll +++ b/llvm/test/CodeGen/X86/pr63108.ll @@ -34,9 +34,10 @@ ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: .LBB0_5: # %for.cond.cleanup -; SSE-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movsbl %al, %ecx +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: movsbl %al, %eax ; SSE-NEXT: addl %ecx, %eax ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -2434,134 +2434,128 @@ ; ; SSE2-ONLY-LABEL: vec384_v3i8: ; SSE2-ONLY: # %bb.0: -; SSE2-ONLY-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1 -; SSE2-ONLY-NEXT: movdqa %xmm1, 
-{{[0-9]+}}(%rsp) -; SSE2-ONLY-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-ONLY-NEXT: movb %al, 2(%rsi) -; SSE2-ONLY-NEXT: movd %xmm1, %ecx -; SSE2-ONLY-NEXT: movw %cx, (%rsi) -; SSE2-ONLY-NEXT: movb %al, 2(%rdx) -; SSE2-ONLY-NEXT: movw %cx, (%rdx) -; SSE2-ONLY-NEXT: movb %al, 6(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 4(%rdx) -; SSE2-ONLY-NEXT: movb %al, 10(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 8(%rdx) -; SSE2-ONLY-NEXT: movb %al, 14(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 12(%rdx) -; SSE2-ONLY-NEXT: movb %al, 18(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 16(%rdx) -; SSE2-ONLY-NEXT: movb %al, 22(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 20(%rdx) -; SSE2-ONLY-NEXT: movb %al, 26(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 24(%rdx) -; SSE2-ONLY-NEXT: movb %al, 30(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 28(%rdx) -; SSE2-ONLY-NEXT: movb %al, 34(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 32(%rdx) -; SSE2-ONLY-NEXT: movb %al, 38(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 36(%rdx) -; SSE2-ONLY-NEXT: movb %al, 42(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 40(%rdx) -; SSE2-ONLY-NEXT: movb %al, 46(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 44(%rdx) -; SSE2-ONLY-NEXT: movb %al, 50(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 48(%rdx) -; SSE2-ONLY-NEXT: movb %al, 54(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 52(%rdx) -; SSE2-ONLY-NEXT: movb %al, 58(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 56(%rdx) -; SSE2-ONLY-NEXT: movb %al, 62(%rdx) -; SSE2-ONLY-NEXT: movw %cx, 60(%rdx) +; SSE2-ONLY-NEXT: movl (%rdi), %eax +; SSE2-ONLY-NEXT: notl %eax +; SSE2-ONLY-NEXT: movw %ax, (%rsi) +; SSE2-ONLY-NEXT: movl %eax, %ecx +; SSE2-ONLY-NEXT: shrl $16, %ecx +; SSE2-ONLY-NEXT: movb %cl, 2(%rsi) +; SSE2-ONLY-NEXT: movb %cl, 2(%rdx) +; SSE2-ONLY-NEXT: movw %ax, (%rdx) +; SSE2-ONLY-NEXT: movb %cl, 6(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 10(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 8(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 14(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 18(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 16(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 
22(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 26(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 24(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 30(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 34(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 32(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 38(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 42(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 40(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 46(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 50(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 48(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 54(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 58(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 56(%rdx) +; SSE2-ONLY-NEXT: movb %cl, 62(%rdx) +; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) ; SSE2-ONLY-NEXT: retq ; ; SSE3-LABEL: vec384_v3i8: ; SSE3: # %bb.0: -; SSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: movb %al, 2(%rsi) -; SSE3-NEXT: movd %xmm1, %ecx -; SSE3-NEXT: movw %cx, (%rsi) -; SSE3-NEXT: movb %al, 2(%rdx) -; SSE3-NEXT: movw %cx, (%rdx) -; SSE3-NEXT: movb %al, 6(%rdx) -; SSE3-NEXT: movw %cx, 4(%rdx) -; SSE3-NEXT: movb %al, 10(%rdx) -; SSE3-NEXT: movw %cx, 8(%rdx) -; SSE3-NEXT: movb %al, 14(%rdx) -; SSE3-NEXT: movw %cx, 12(%rdx) -; SSE3-NEXT: movb %al, 18(%rdx) -; SSE3-NEXT: movw %cx, 16(%rdx) -; SSE3-NEXT: movb %al, 22(%rdx) -; SSE3-NEXT: movw %cx, 20(%rdx) -; SSE3-NEXT: movb %al, 26(%rdx) -; SSE3-NEXT: movw %cx, 24(%rdx) -; SSE3-NEXT: movb %al, 30(%rdx) -; SSE3-NEXT: movw %cx, 28(%rdx) -; SSE3-NEXT: movb %al, 34(%rdx) -; SSE3-NEXT: movw %cx, 32(%rdx) -; SSE3-NEXT: movb %al, 38(%rdx) -; SSE3-NEXT: movw %cx, 36(%rdx) -; SSE3-NEXT: movb %al, 42(%rdx) -; SSE3-NEXT: movw %cx, 40(%rdx) -; SSE3-NEXT: movb %al, 46(%rdx) -; SSE3-NEXT: movw %cx, 44(%rdx) -; SSE3-NEXT: movb %al, 
50(%rdx) -; SSE3-NEXT: movw %cx, 48(%rdx) -; SSE3-NEXT: movb %al, 54(%rdx) -; SSE3-NEXT: movw %cx, 52(%rdx) -; SSE3-NEXT: movb %al, 58(%rdx) -; SSE3-NEXT: movw %cx, 56(%rdx) -; SSE3-NEXT: movb %al, 62(%rdx) -; SSE3-NEXT: movw %cx, 60(%rdx) +; SSE3-NEXT: movl (%rdi), %eax +; SSE3-NEXT: notl %eax +; SSE3-NEXT: movw %ax, (%rsi) +; SSE3-NEXT: movl %eax, %ecx +; SSE3-NEXT: shrl $16, %ecx +; SSE3-NEXT: movb %cl, 2(%rsi) +; SSE3-NEXT: movb %cl, 2(%rdx) +; SSE3-NEXT: movw %ax, (%rdx) +; SSE3-NEXT: movb %cl, 6(%rdx) +; SSE3-NEXT: movw %ax, 4(%rdx) +; SSE3-NEXT: movb %cl, 10(%rdx) +; SSE3-NEXT: movw %ax, 8(%rdx) +; SSE3-NEXT: movb %cl, 14(%rdx) +; SSE3-NEXT: movw %ax, 12(%rdx) +; SSE3-NEXT: movb %cl, 18(%rdx) +; SSE3-NEXT: movw %ax, 16(%rdx) +; SSE3-NEXT: movb %cl, 22(%rdx) +; SSE3-NEXT: movw %ax, 20(%rdx) +; SSE3-NEXT: movb %cl, 26(%rdx) +; SSE3-NEXT: movw %ax, 24(%rdx) +; SSE3-NEXT: movb %cl, 30(%rdx) +; SSE3-NEXT: movw %ax, 28(%rdx) +; SSE3-NEXT: movb %cl, 34(%rdx) +; SSE3-NEXT: movw %ax, 32(%rdx) +; SSE3-NEXT: movb %cl, 38(%rdx) +; SSE3-NEXT: movw %ax, 36(%rdx) +; SSE3-NEXT: movb %cl, 42(%rdx) +; SSE3-NEXT: movw %ax, 40(%rdx) +; SSE3-NEXT: movb %cl, 46(%rdx) +; SSE3-NEXT: movw %ax, 44(%rdx) +; SSE3-NEXT: movb %cl, 50(%rdx) +; SSE3-NEXT: movw %ax, 48(%rdx) +; SSE3-NEXT: movb %cl, 54(%rdx) +; SSE3-NEXT: movw %ax, 52(%rdx) +; SSE3-NEXT: movb %cl, 58(%rdx) +; SSE3-NEXT: movw %ax, 56(%rdx) +; SSE3-NEXT: movb %cl, 62(%rdx) +; SSE3-NEXT: movw %ax, 60(%rdx) ; SSE3-NEXT: retq ; ; SSSE3-ONLY-LABEL: vec384_v3i8: ; SSSE3-ONLY: # %bb.0: -; SSSE3-ONLY-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1 -; SSSE3-ONLY-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-ONLY-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSSE3-ONLY-NEXT: movb %al, 2(%rsi) -; SSSE3-ONLY-NEXT: movd %xmm1, %ecx -; SSSE3-ONLY-NEXT: movw %cx, (%rsi) -; SSSE3-ONLY-NEXT: movb %al, 2(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, (%rdx) -; SSSE3-ONLY-NEXT: 
movb %al, 6(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 4(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 10(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 8(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 14(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 12(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 18(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 16(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 22(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 20(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 26(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 24(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 30(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 28(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 34(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 32(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 38(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 36(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 42(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 40(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 46(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 44(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 50(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 48(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 54(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 52(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 58(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 56(%rdx) -; SSSE3-ONLY-NEXT: movb %al, 62(%rdx) -; SSSE3-ONLY-NEXT: movw %cx, 60(%rdx) +; SSSE3-ONLY-NEXT: movl (%rdi), %eax +; SSSE3-ONLY-NEXT: notl %eax +; SSSE3-ONLY-NEXT: movw %ax, (%rsi) +; SSSE3-ONLY-NEXT: movl %eax, %ecx +; SSSE3-ONLY-NEXT: shrl $16, %ecx +; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi) +; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, (%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) +; SSSE3-ONLY-NEXT: 
movb %cl, 34(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx) +; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) ; SSSE3-ONLY-NEXT: retq ; ; SSE41-LABEL: vec384_v3i8: