Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7147,51 +7147,59 @@
 
   SDLoc dl(Op);
   SDValue V;
-  bool First = true;
 
   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
-  for (unsigned i = 0; i < 16; ++i) {
+  for (unsigned i = 0; i < 16; i += 2) {
     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
-    if (ThisIsNonZero && First) {
-      if (NumZero)
-        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+    bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
+    if (!ThisIsNonZero && !NextIsNonZero)
+      continue;
+
+    // FIXME: Investigate combining the first 4 bytes as a i32 instead.
+    SDValue Elt;
+    // Zero-extend the low byte when bits 8-15 must be clean: the vector has
+    // zero elements (NumZero) or the high byte gets OR'd in below.
+    if (ThisIsNonZero) {
+      if (NumZero || NextIsNonZero)
+        Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
       else
-        V = DAG.getUNDEF(MVT::v8i16);
-      First = false;
+        Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
     }
 
-    if ((i & 1) != 0) {
-      // FIXME: Investigate extending to i32 instead of just i16.
-      // FIXME: Investigate combining the first 4 bytes as a i32 instead.
-      SDValue ThisElt, LastElt;
-      bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
-      if (LastIsNonZero) {
-        LastElt =
-            DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
-      }
-      if (ThisIsNonZero) {
-        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
-        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
-                              DAG.getConstant(8, dl, MVT::i8));
-        if (LastIsNonZero)
-          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
-      } else
-        ThisElt = LastElt;
-
-      if (ThisElt) {
-        if (1 == i) {
-          V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
-                      : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
-          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
-          if (NumZero)
-            V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
-          V = DAG.getBitcast(MVT::v8i16, V);
-        } else {
-          V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
-                          DAG.getIntPtrConstant(i / 2, dl));
-        }
+    if (NextIsNonZero) {
+      SDValue NextElt;
+      // Bits 16-31 are only kept by the i == 0 SCALAR_TO_VECTOR path below;
+      // later insertions truncate to i16, so any-extension is enough there.
+      if (i == 0 && NumZero)
+        NextElt = DAG.getZExtOrTrunc(Op.getOperand(i+1), dl, MVT::i32);
+      else
+        NextElt = DAG.getAnyExtOrTrunc(Op.getOperand(i+1), dl, MVT::i32);
+      NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
+                            DAG.getConstant(8, dl, MVT::i8));
+      if (ThisIsNonZero)
+        Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
+      else
+        Elt = NextElt;
+    }
+
+    // If our first insertion is not the first index then insert into zero
+    // vector to break any register dependency else use SCALAR_TO_VECTOR.
+    if (!V) {
+      if (i != 0)
+        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+      else {
+        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
+        // SCALAR_TO_VECTOR leaves the upper elements undefined; zero them
+        // when the build vector contains zero elements.
+        if (NumZero)
+          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+        V = DAG.getBitcast(MVT::v8i16, V);
+        continue;
       }
     }
+    Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
+    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
+                    DAG.getIntPtrConstant(i / 2, dl));
   }
 
   return DAG.getBitcast(MVT::v16i8, V);
Index: llvm/test/CodeGen/X86/buildvec-insertvec.ll
===================================================================
--- llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -413,16 +413,13 @@
 define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
 ; SSE2-LABEL: test_buildvector_v16i8_partial:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
-; SSE2-NEXT:    movzbl %sil, %eax
-; SSE2-NEXT:    pinsrw $3, %eax, %xmm0
-; SSE2-NEXT:    movzbl %dl, %eax
-; SSE2-NEXT:    pinsrw $4, %eax, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $1, %edi, %xmm0
+; SSE2-NEXT:    pinsrw $3, %esi, %xmm0
+; SSE2-NEXT:    pinsrw $4, %edx, %xmm0
 ; SSE2-NEXT:    shll $8, %ecx
 ; SSE2-NEXT:    pinsrw $5, %ecx, %xmm0
-; SSE2-NEXT:    movzbl %r8b, %eax
-; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE2-NEXT:    pinsrw $6, %r8d, %xmm0
 ; SSE2-NEXT:    shll $8, %r9d
 ; SSE2-NEXT:    pinsrw $7, %r9d, %xmm0
 ; SSE2-NEXT:    retq
Index: llvm/test/CodeGen/X86/promote-vec3.ll
===================================================================
--- llvm/test/CodeGen/X86/promote-vec3.ll
+++ llvm/test/CodeGen/X86/promote-vec3.ll
@@ -70,11 +70,10 @@
 define <3 x i16> @sext_i8(<3 x i8>) {
 ; SSE3-LABEL: sext_i8:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; SSE3-NEXT:    movd %edx, %xmm0
-; SSE3-NEXT:    pinsrw $1, %ecx, %xmm0
+; SSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT:    pinsrw $1, %eax, %xmm0
+; SSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE3-NEXT:    pinsrw $2, %eax, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
 ; SSE3-NEXT:    psraw $8, %xmm0
Index: llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -2045,8 +2045,7 @@
 ; SSE2-NEXT:    movzbl (%rsi), %ecx
 ; SSE2-NEXT:    shll $8, %ecx
 ; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzwl %cx, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movd %ecx, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7]
Index: llvm/test/CodeGen/X86/widen_conv-3.ll
===================================================================
--- llvm/test/CodeGen/X86/widen_conv-3.ll
+++ llvm/test/CodeGen/X86/widen_conv-3.ll
@@ -51,6 +51,7 @@
 ; X86-SSE2-NEXT:    movl (%esp), %edx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE2-NEXT:    shll $8, %edx
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm0
 ; X86-SSE2-NEXT:    pinsrw $1, %edx, %xmm0
 ; X86-SSE2-NEXT:    shll $8, %esi
 ; X86-SSE2-NEXT:    pinsrw $3, %esi, %xmm0
@@ -99,6 +100,7 @@
 ; X64-SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
 ; X64-SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
 ; X64-SSE2-NEXT:    shll $8, %eax
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm0
 ; X64-SSE2-NEXT:    pinsrw $1, %eax, %xmm0
 ; X64-SSE2-NEXT:    shll $8, %ecx
 ; X64-SSE2-NEXT:    pinsrw $3, %ecx, %xmm0