Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -6932,6 +6932,37 @@
   return Zeroable;
 }
 
+/// \brief Compute whether each element of a target shuffle is known to be zero.
+static SmallBitVector computeKnownZeroShuffleElements(ArrayRef<int> Mask,
+                                                      SDValue V1, SDValue V2) {
+  SmallBitVector KnownZero(Mask.size(), false);
+
+  while (V1.getOpcode() == ISD::BITCAST)
+    V1 = V1->getOperand(0);
+  while (V2.getOpcode() == ISD::BITCAST)
+    V2 = V2->getOperand(0);
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    int M = Mask[i];
+    // Handle the easy case.
+    if (M == SM_SentinelZero) {
+      KnownZero[i] = true;
+      continue;
+    }
+
+    // If this is an index into a build_vector node (which has the same number
+    // of elements), dig out the input value and use it.
+    SDValue V = M < Size ? V1 : V2;
+    if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+      continue;
+
+    SDValue Input = V.getOperand(M % Size);
+    KnownZero[i] = X86::isZeroNode(Input);
+  }
+
+  return KnownZero;
+}
+
 // X86 has dedicated unpack instructions that can handle specific blend
 // operations: UNPCKH and UNPCKL.
 static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
@@ -23568,6 +23599,30 @@
   return true;
 }
 
+/// Check a target shuffle mask's inputs to see if we can set any values to
+/// SM_SentinelZero - this is for elements that must be set to zero,
+/// not just zeroable.
+static bool ResolveTargetShuffleZeroElts(SDValue N,
+                                         SmallVectorImpl<int> &Mask) {
+  if (!isTargetShuffle(N.getOpcode()))
+    return false;
+
+  bool IsUnary;
+  if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Mask,
+                            IsUnary))
+    return false;
+
+  SDValue V1 = N.getOperand(0);
+  SDValue V2 = IsUnary ? V1 : N.getOperand(1);
+  SmallBitVector KnownZero = computeKnownZeroShuffleElements(Mask, V1, V2);
+
+  for (int i = 0, e = Mask.size(); i != e; ++i)
+    if (KnownZero[i])
+      Mask[i] = SM_SentinelZero;
+
+  return true;
+}
+
 /// \brief Try to combine x86 target specific shuffles.
 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
                                            TargetLowering::DAGCombinerInfo &DCI,
@@ -23641,6 +23696,96 @@
           return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
         }
 
+    // Attempt to merge blend(insertps(x,y),zero).
+    if (V0.getOpcode() == X86ISD::INSERTPS ||
+        V1.getOpcode() == X86ISD::INSERTPS) {
+      assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+
+      // Determine which elements are known to be zero.
+      SmallVector<int, 8> TargetMask;
+      if (!ResolveTargetShuffleZeroElts(N, TargetMask))
+        return SDValue();
+
+      // Helper function to take inner insertps node and attempt to
+      // merge the blend with zero into its zero mask.
+      auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
+        if (V.getOpcode() != X86ISD::INSERTPS)
+          return SDValue();
+        SDValue Op0 = V.getOperand(0);
+        SDValue Op1 = V.getOperand(1);
+        SDValue Op2 = V.getOperand(2);
+        unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+
+        // Check each element of the blend node's target mask - must either
+        // be zeroable (and update the zero mask) or selects the element from
+        // the inner insertps node.
+        for (int i = 0; i != 4; ++i)
+          if (TargetMask[i] < 0)
+            InsertPSMask |= (1u << i);
+          else if (TargetMask[i] != (i + Offset))
+            return SDValue();
+        return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
+                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
+      };
+
+      if (SDValue V = MergeInsertPSAndBlend(V0, 0))
+        return V;
+      if (SDValue V = MergeInsertPSAndBlend(V1, 4))
+        return V;
+    }
+    return SDValue();
+  }
+  case X86ISD::INSERTPS: {
+    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+    SDValue Op0 = N.getOperand(0);
+    SDValue Op1 = N.getOperand(1);
+    SDValue Op2 = N.getOperand(2);
+    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+    unsigned DstIdx = (InsertPSMask >> 4) & 3;
+
+    // Attempt to merge insertps with an inner target shuffle node.
+    SmallVector<int, 8> TargetMask;
+    if (!ResolveTargetShuffleZeroElts(Op0, TargetMask))
+      return SDValue();
+
+    bool Updated = false;
+    bool UseInput00 = false;
+    bool UseInput01 = false;
+    for (int i = 0; i != 4; ++i) {
+      int M = TargetMask[i];
+      if ((InsertPSMask & (1u << i)) || (i == DstIdx)) {
+        // No change if element is already zero or the inserted element.
+        continue;
+      } else if (M < 0) {
+        // If the target mask is undef/zero then we must zero the element.
+        InsertPSMask |= (1u << i);
+        Updated = true;
+        continue;
+      }
+
+      // The input vector element must be inline.
+      if (M != i && M != (i + 4))
+        return SDValue();
+
+      // Determine which inputs of the target shuffle we're using.
+      UseInput00 |= (0 <= M && M < 4);
+      UseInput01 |= (4 <= M);
+    }
+
+    // If we're not using both inputs of the target shuffle then use the
+    // referenced input directly.
+    if (UseInput00 && !UseInput01) {
+      Updated = true;
+      Op0 = Op0.getOperand(0);
+    } else if (!UseInput00 && UseInput01) {
+      Updated = true;
+      Op0 = Op0.getOperand(1);
+    }
+
+    if (Updated)
+      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
+                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
     return SDValue();
   }
   default:
@@ -27998,6 +28143,7 @@
   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
+  case X86ISD::INSERTPS:
   case X86ISD::PALIGNR:
   case X86ISD::BLENDI:
   case X86ISD::UNPCKH:
Index: test/CodeGen/X86/insertps-combine.ll
===================================================================
--- test/CodeGen/X86/insertps-combine.ll
+++ test/CodeGen/X86/insertps-combine.ll
@@ -6,16 +6,12 @@
 define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
 ; SSE-LABEL: shuffle_v4f32_0z27:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_0z27:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
 ; AVX-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -50,16 +46,12 @@
 define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
 ; SSE-LABEL: shuffle_v4f32_0z24:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_0z24:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
 ; AVX-NEXT:    retq
   %vecext = extractelement <4 x float> %xyzw, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
Index: test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1080,15 +1080,11 @@
 ; SSE41-LABEL: shuffle_v4f32_0zz6:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_0zz6:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
   %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
@@ -1129,15 +1125,11 @@
 ; SSE41-LABEL: shuffle_v4f32_0z24:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_0z24:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
   %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>