Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5513,6 +5513,73 @@
   return DAG.getBitcast(VT, Result);
 }
 
+/// Attempts to lower a BUILD_VECTOR to a BLEND.
+// Possible if all operands are constants or UNDEFs, except for the first:
+//   build_vector(X, C0, C1, C2) ->
+//     BLEND(scalar_to_vector(X), const_pool_vec(Ud, C0, C1, C2), 0111)
+static SDValue LowerBuildVectorAsBLEND(SDValue BV, SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
+  const TargetLowering &TLI = *Subtarget.getTargetLowering();
+  EVT VT = BV.getValueType();
+  EVT EltVT = VT.getScalarType();
+  unsigned NumElts = BV.getNumOperands();
+  SDLoc DL(BV);
+
+  // Only the 128-bit, 4-element forms are handled.
+  if (VT != MVT::v4f32 && VT != MVT::v4i32)
+    return {};
+
+  // Check that the BUILD_VECTOR's last N-1 operands are constants or undef.
+  for (unsigned i = 1; i != NumElts; ++i) {
+    const SDValue &Op = BV.getOperand(i);
+    if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
+      return {};
+  }
+
+  // Build the vector of constants which will be loaded from the constant pool.
+  SmallVector<Constant *, 4> CV;
+  CV.reserve(NumElts);
+  // First element is an UNDEF since it will not be selected.
+  CV.push_back(UndefValue::get(EltVT.getTypeForEVT(*DAG.getContext())));
+  // Remaining elements will be selected.
+  for (unsigned i = 1; i != NumElts; ++i) {
+    if (ConstantFPSDNode *V = dyn_cast<ConstantFPSDNode>(BV->getOperand(i))) {
+      CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
+    } else if (ConstantSDNode *V =
+                   dyn_cast<ConstantSDNode>(BV->getOperand(i))) {
+      CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+    } else {
+      assert(BV->getOperand(i).isUndef());
+      Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
+      CV.push_back(UndefValue::get(OpNTy));
+    }
+  }
+  Constant *CP = ConstantVector::get(CV);
+  SDValue CPIdx =
+      DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue C = DAG.getLoad(
+      VT, DL, DAG.getEntryNode(), CPIdx,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
+  // The first element is placed into a vector which will be the BLEND's
+  // operand.
+  SDValue Elt0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BV.getOperand(0));
+  // BLEND is natively an operation on float elements, so cast if needed.
+  EVT BlendVT = VT;
+  if (!EltVT.isFloatingPoint()) {
+    BlendVT = MVT::v4f32;
+    C = DAG.getBitcast(BlendVT, C);
+    Elt0 = DAG.getBitcast(BlendVT, Elt0);
+  }
+  // Blend mask 0xE = [0, 1, 1, 1]: element 0 from Elt0, the rest from C.
+  unsigned char BlendMask = 0xE;
+  SDValue Blend = DAG.getNode(X86ISD::BLENDI, DL, BlendVT, Elt0, C,
+                              DAG.getConstant(BlendMask, DL, MVT::i8));
+  // Bitcast back so the result type matches the original BUILD_VECTOR (the
+  // input may be v4i32 while the BLEND was performed on v4f32).
+  return DAG.getBitcast(VT, Blend);
+}
+
 /// Return a vector logical shift node.
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                          SelectionDAG &DAG, const TargetLowering &TLI,
@@ -6874,14 +6941,17 @@
   if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
     return Sh;
 
-  // For SSE 4.1, use insertps to put the high elements into the low element.
   if (Subtarget.hasSSE41()) {
     SDValue Result;
-    if (!Op.getOperand(0).isUndef())
+    if (!Op.getOperand(0).isUndef()) {
+      // Attempt to lower as a BLEND
+      if (SDValue Res = LowerBuildVectorAsBLEND(Op, DAG, Subtarget))
+        return Res;
       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+    }
     else
       Result = DAG.getUNDEF(VT);
-
+    // use insertps to put the high elements into the low element.
     for (unsigned i = 1; i < NumElems; ++i) {
       if (Op.getOperand(i).isUndef()) continue;
       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Index: test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining.ll
+++ test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2859,16 +2859,12 @@
 ;
 ; SSE41-LABEL: combine_constant_insertion_v4f32:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_constant_insertion_v4f32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; AVX-NEXT:    retq
   %a0 = insertelement <4 x float> undef, float %f, i32 0
   %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 30.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -2907,23 +2903,13 @@
 ; SSE41-LABEL: combine_constant_insertion_v4i32:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    movl $4, %eax
-; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE41-NEXT:    movl $5, %eax
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm0
-; SSE41-NEXT:    movl $30, %eax
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_constant_insertion_v4i32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    movl $4, %eax
-; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $5, %eax
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $30, %eax
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; AVX-NEXT:    retq
   %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
   %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>