Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5522,6 +5522,61 @@
   return DAG.getBitcast(VT, Result);
 }
 
+/// Attempts to lower a BUILD_VECTOR to a BLEND-matchable pattern.
+/// Possible if all operands are constants or UNDEFs, except for the first:
+///   build_vector(X, C0, C1, C2) ->
+///   vector_shuffle<4,1,2,3>(const_pool_vec(Ud,C0,C1,C2), scalar_to_vector(X))
+static SDValue LowerBuildVectorAsBLEND(SDValue BV, SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
+  const TargetLowering &TLI = *Subtarget.getTargetLowering();
+  EVT VT = BV.getValueType();
+  EVT EltVT = VT.getScalarType();
+  unsigned NumElts = BV.getNumOperands();
+  SDLoc DL(BV);
+
+  if (VT != MVT::v4f32 && VT != MVT::v4i32)
+    return {};
+
+  // Check that the BUILD_VECTOR's last N-1 operands are constants.
+  for (unsigned i = 1; i != NumElts; ++i) {
+    const SDValue &Op = BV.getOperand(i);
+    if (!Op.isUndef() && !isa<ConstantFPSDNode>(Op) && !isa<ConstantSDNode>(Op))
+      return {};
+  }
+
+  // Build the vector of constants which will be loaded from the constant pool.
+  // FIXME: It would be better to construct a BUILD_VECTOR node of constants and
+  // undefs and let the legalizer expand it to a constant pool load.
+  // Unfortunately, this does not work because the constant nodes get legalized
+  // to individual constant pool loads before the BUILD_VECTOR is visited.
+  SmallVector<Constant *, 4> CV;
+  CV.reserve(NumElts);
+  // First element is an UNDEF since it will not be selected.
+  CV.push_back(UndefValue::get(EltVT.getTypeForEVT(*DAG.getContext())));
+  // Remaining elements will be selected.
+  for (unsigned i = 1; i != NumElts; ++i) {
+    if (ConstantFPSDNode *V = dyn_cast<ConstantFPSDNode>(BV->getOperand(i))) {
+      CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
+    } else if (ConstantSDNode *V =
+                   dyn_cast<ConstantSDNode>(BV->getOperand(i))) {
+      CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+    } else {
+      assert(BV->getOperand(i).isUndef());
+      Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
+      CV.push_back(UndefValue::get(OpNTy));
+    }
+  }
+  Constant *CP = ConstantVector::get(CV);
+  SDValue CPIdx =
+      DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue C = DAG.getLoad(
+      VT, DL, DAG.getEntryNode(), CPIdx,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
+  SDValue Elt0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BV.getOperand(0));
+  int Mask[] = {4, 1, 2, 3};
+  return DAG.getVectorShuffle(VT, DL, C, Elt0, Mask);
+}
+
 /// Return a vector logical shift node.
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                          SelectionDAG &DAG, const TargetLowering &TLI,
@@ -6886,9 +6941,12 @@
   // For SSE 4.1, use insertps to put the high elements into the low element.
   if (Subtarget.hasSSE41()) {
     SDValue Result;
-    if (!Op.getOperand(0).isUndef())
+    if (!Op.getOperand(0).isUndef()) {
+      // Attempt to lower as a BLEND.
+      if (SDValue Res = LowerBuildVectorAsBLEND(Op, DAG, Subtarget))
+        return Res;
       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
-    else
+    } else
       Result = DAG.getUNDEF(VT);
     for (unsigned i = 1; i < NumElems; ++i) {
Index: test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining.ll
+++ test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2859,16 +2859,12 @@
 ;
 ; SSE41-LABEL: combine_constant_insertion_v4f32:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_constant_insertion_v4f32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; AVX-NEXT:    retq
   %a0 = insertelement <4 x float> undef, float %f, i32 0
   %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 30.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -2907,24 +2903,20 @@
 ; SSE41-LABEL: combine_constant_insertion_v4i32:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    movl $4, %eax
-; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE41-NEXT:    movl $5, %eax
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm0
-; SSE41-NEXT:    movl $30, %eax
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_constant_insertion_v4i32:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    movl $4, %eax
-; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $5, %eax
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $30, %eax
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_constant_insertion_v4i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovd %edi, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_constant_insertion_v4i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; AVX2-NEXT:    retq
   %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
   %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   ret <4 x i32> %ret
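
Note for reviewers: below is a minimal standalone reproducer for the new lowering path, kept out of the patch itself. It is a sketch: the function name @repro and the llc invocation are assumptions, while the constants and the expected single blendps mirror the updated combine_constant_insertion_v4f32 checks above.

; Run with: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 repro.ll -o -
define <4 x float> @repro(float %f) {
  ; A vector with a variable lane 0 and a constant tail. The DAG combiner
  ; folds the insertelement + shufflevector pair into
  ; build_vector(%f, 4.0, 5.0, 30.0), which LowerBuildVectorAsBLEND now
  ; turns into one blendps against a constant-pool vector instead of three
  ; insertps instructions.
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 30.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %ret
}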