Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5779,14 +5779,133 @@
   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
 }
 
-/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// If \p ExtractedFromVec is a shuffle, it may be replaced by the shuffle's
+/// first operand; the returned index then comes from the shuffle mask.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+                                         SDValue ExtIdx) {
+  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+    return Idx;
+
+  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+  // lowered this:
+  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+  // to:
+  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
+  //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
+  //                           undef)
+  //                       Constant<0>)
+  // In this case the vector is the extract_subvector expression and the index
+  // is 2, as specified by the shuffle.
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+  SDValue ShuffleVec = SVOp->getOperand(0);
+  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+  assert(ShuffleVecVT.getVectorElementType() ==
+         ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+  int ShuffleIdx = SVOp->getMaskElt(Idx);
+  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+    ExtractedFromVec = ShuffleVec;
+    return ShuffleIdx;
+  }
+  return Idx;
+}
+
+/// \brief If BUILD_VECTOR uses several extractelts and few insertions (at most
+/// two pass the check below), transform it into a SHUFFLE + INSERT_VECTOR_ELT.
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  // Skip if insert_vec_elt is not supported.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+    return SDValue();
+
+  SDLoc DL(Op);
+  unsigned NumElems = Op.getNumOperands();
+
+  SDValue VecIn1;
+  SDValue VecIn2;
+  SmallVector<unsigned, 4> InsertIndices;
+  SmallVector<int, 8> Mask(NumElems, -1);
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Opc = Op.getOperand(i).getOpcode();
+
+    if (Opc == ISD::UNDEF)
+      continue;
+
+    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+      // Quit on a third non-extract operand (size is tested before the push).
+      if (InsertIndices.size() > 1)
+        return SDValue();
+
+      InsertIndices.push_back(i);
+      continue;
+    }
+
+    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+    // Quit if non-constant index.
+    if (!isa<ConstantSDNode>(ExtIdx))
+      return SDValue();
+    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
+
+    // Quit if extracted from vector of different type.
+    if (ExtractedFromVec.getValueType() != VT)
+      return SDValue();
+
+    if (!VecIn1.getNode())
+      VecIn1 = ExtractedFromVec;
+    else if (VecIn1 != ExtractedFromVec) {
+      if (!VecIn2.getNode())
+        VecIn2 = ExtractedFromVec;
+      else if (VecIn2 != ExtractedFromVec)
+        // Quit if more than 2 vectors to shuffle
+        return SDValue();
+    }
+
+    if (ExtractedFromVec == VecIn1)
+      Mask[i] = Idx;
+    else if (ExtractedFromVec == VecIn2)
+      Mask[i] = Idx + NumElems;
+  }
+
+  if (!VecIn1.getNode())
+    return SDValue();
+
+  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+    unsigned Idx = InsertIndices[i];
+    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+                     DAG.getIntPtrConstant(Idx));
+  }
+
+  return NV;
+}
+
+
+/// \brief Custom lower build_vector of v8i16. A few options are available:
+///
+/// (1) With more than half of the elts non-zero and no SSE2, fall back to a
+///     likely series of unpck*.
+/// (2) With SSE2 support use several insert_vector_elts to get PINSRWs.
+/// (3) Use a shuffle.
 ///
 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                      unsigned NumNonZero, unsigned NumZero,
                                      SelectionDAG &DAG,
                                      const X86Subtarget* Subtarget,
                                      const TargetLowering &TLI) {
-  if (NumNonZero > 4)
+  SDValue Sh = buildFromShuffleMostly(Op, DAG);
+  if (Sh.getNode())
+    return Sh;
+
+  if (NumNonZero > 4 && !Subtarget->hasSSE2())
     return SDValue();
 
   SDLoc dl(Op);
@@ -6271,113 +6390,6 @@
   return SDValue();
 }
 
-/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
-/// underlying vector and index.
-///
-/// Modifies \p ExtractedFromVec to the real vector and returns the real
-/// index.
-static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
-                                         SDValue ExtIdx) {
-  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
-  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
-    return Idx;
-
-  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
-  // lowered this:
-  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
-  // to:
-  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
-  //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
-  //                           undef)
-  //                       Constant<0>)
-  // In this case the vector is the extract_subvector expression and the index
-  // is 2, as specified by the shuffle.
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
-  SDValue ShuffleVec = SVOp->getOperand(0);
-  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
-  assert(ShuffleVecVT.getVectorElementType() ==
-         ExtractedFromVec.getSimpleValueType().getVectorElementType());
-
-  int ShuffleIdx = SVOp->getMaskElt(Idx);
-  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
-    ExtractedFromVec = ShuffleVec;
-    return ShuffleIdx;
-  }
-  return Idx;
-}
-
-static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-
-  // Skip if insert_vec_elt is not supported.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
-    return SDValue();
-
-  SDLoc DL(Op);
-  unsigned NumElems = Op.getNumOperands();
-
-  SDValue VecIn1;
-  SDValue VecIn2;
-  SmallVector<unsigned, 4> InsertIndices;
-  SmallVector<int, 8> Mask(NumElems, -1);
-
-  for (unsigned i = 0; i != NumElems; ++i) {
-    unsigned Opc = Op.getOperand(i).getOpcode();
-
-    if (Opc == ISD::UNDEF)
-      continue;
-
-    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
-      // Quit if more than 1 elements need inserting.
-      if (InsertIndices.size() > 1)
-        return SDValue();
-
-      InsertIndices.push_back(i);
-      continue;
-    }
-
-    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
-    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
-    // Quit if non-constant index.
-    if (!isa<ConstantSDNode>(ExtIdx))
-      return SDValue();
-    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
-
-    // Quit if extracted from vector of different type.
-    if (ExtractedFromVec.getValueType() != VT)
-      return SDValue();
-
-    if (!VecIn1.getNode())
-      VecIn1 = ExtractedFromVec;
-    else if (VecIn1 != ExtractedFromVec) {
-      if (!VecIn2.getNode())
-        VecIn2 = ExtractedFromVec;
-      else if (VecIn2 != ExtractedFromVec)
-        // Quit if more than 2 vectors to shuffle
-        return SDValue();
-    }
-
-    if (ExtractedFromVec == VecIn1)
-      Mask[i] = Idx;
-    else if (ExtractedFromVec == VecIn2)
-      Mask[i] = Idx + NumElems;
-  }
-
-  if (!VecIn1.getNode())
-    return SDValue();
-
-  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
-  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
-  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
-    unsigned Idx = InsertIndices[i];
-    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
-                     DAG.getIntPtrConstant(Idx));
-  }
-
-  return NV;
-}
-
 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
 SDValue
 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
Index: test/CodeGen/X86/vec_set.ll
===================================================================
--- test/CodeGen/X86/vec_set.ll
+++ test/CodeGen/X86/vec_set.ll
@@ -1,15 +1,27 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep punpckl | count 7
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2,-sse4.1 | FileCheck %s
 
-define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
-        %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0          ; <<8 x i16>> [#uses=1]
-        %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1            ; <<8 x i16>> [#uses=1]
-        %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2           ; <<8 x i16>> [#uses=1]
-        %tmp6 = insertelement <8 x i16> %tmp4, i16 %a3, i32 3           ; <<8 x i16>> [#uses=1]
-        %tmp8 = insertelement <8 x i16> %tmp6, i16 %a4, i32 4           ; <<8 x i16>> [#uses=1]
-        %tmp10 = insertelement <8 x i16> %tmp8, i16 %a5, i32 5          ; <<8 x i16>> [#uses=1]
-        %tmp12 = insertelement <8 x i16> %tmp10, i16 %a6, i32 6         ; <<8 x i16>> [#uses=1]
-        %tmp14 = insertelement <8 x i16> %tmp12, i16 %a7, i32 7         ; <<8 x i16>> [#uses=1]
-        store <8 x i16> %tmp14, <8 x i16>* %b
-        ret void
+; Build a <8 x i16> from eight scalars; each element should become a pinsrw.
+define void @test0(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
+; CHECK-LABEL: test0:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pinsrw $0, %esi, %xmm0
+; CHECK-NEXT:    pinsrw $1, %edx, %xmm0
+; CHECK-NEXT:    pinsrw $2, %ecx, %xmm0
+; CHECK-NEXT:    pinsrw $3, %r8d, %xmm0
+; CHECK-NEXT:    pinsrw $4, %r9d, %xmm0
+; CHECK-NEXT:    pinsrw $5, {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    pinsrw $6, {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    pinsrw $7, {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    movdqa %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+  %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0
+  %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1
+  %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2
+  %tmp6 = insertelement <8 x i16> %tmp4, i16 %a3, i32 3
+  %tmp8 = insertelement <8 x i16> %tmp6, i16 %a4, i32 4
+  %tmp10 = insertelement <8 x i16> %tmp8, i16 %a5, i32 5
+  %tmp12 = insertelement <8 x i16> %tmp10, i16 %a6, i32 6
+  %tmp14 = insertelement <8 x i16> %tmp12, i16 %a7, i32 7
+  store <8 x i16> %tmp14, <8 x i16>* %b
+  ret void
 }
-