Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -3925,6 +3925,29 @@
   return true;
 }
 
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS.
+/// i. e: If all but one element come from the same vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+  // TODO: Deal with AVX's VINSERTPS
+  if (!VT.is128BitVector() || VT != MVT::v4f32)
+    return false;
+
+  unsigned CorrectPosV1 = 0;
+  unsigned CorrectPosV2 = 0;
+  for (int i = 0; i < (int)VT.getVectorNumElements(); ++i)
+    if (Mask[i] == i)
+      ++CorrectPosV1;
+    else if (Mask[i] == i + 4)
+      ++CorrectPosV2;
+
+  if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
+    // We have 3 elements from one vector, and one from another.
+    return true;
+
+  return false;
+}
+
 //
 // Some special combinations that can be optimized.
 //
@@ -7257,6 +7280,67 @@
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG, bool HasAVX) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // TODO: Optimize for AVX cases too (VINSERTPS)
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  auto Mask = SVOp->getMask();
+  assert(VT == MVT::v4f32 && "unsupported vector type for insertps");
+
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; });
+
+  SDValue From;
+  SDValue To;
+  unsigned DestIndex;
+  if (FromV1 == 1) {
+    From = V1;
+    To = V2;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; }) -
+                Mask.begin();
+  } else {
+    From = V2;
+    To = V1;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i >= 4; }) -
+                Mask.begin();
+  }
+
+  if (MayFoldLoad(From)) {
+    // Trivial case, when From comes from a load and is only used by the
+    // shuffle. Make it use insertps from the vector that we need from that
+    // load.
+    SDValue Addr = From.getOperand(1);
+    SDValue NewAddr =
+        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+                    DAG.getConstant(DestIndex * EVT.getStoreSize(),
+                                    Addr.getSimpleValueType()));
+
+    LoadSDNode *Load = cast<LoadSDNode>(From);
+    SDValue Ld = DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                             DAG.getMachineFunction().getMachineMemOperand(
+                                 Load->getMemOperand(), 0, EVT.getStoreSize()));
+
+    // Create this as a scalar to vector to match the instruction pattern.
+    SDValue LoadScalarToVector =
+        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Ld);
+    SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+                       InsertpsMask);
+  }
+
+  // Vector-element-to-vector
+  unsigned SrcIndex = Mask[DestIndex] % 4;
+  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
 // Reduce a vector shuffle to zext.
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -7416,6 +7500,7 @@ bool V1IsSplat = false; bool V2IsSplat = false; bool HasSSE2 = Subtarget->hasSSE2(); + bool HasSSE4 = Subtarget->hasSSE41(); bool HasFp256 = Subtarget->hasFp256(); bool HasInt256 = Subtarget->hasInt256(); MachineFunction &MF = DAG.getMachineFunction(); @@ -7641,6 +7726,9 @@ if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); + if (HasSSE4 && isINSERTPSMask(M, VT)) + return getINSERTPS(SVOp, dl, DAG, Subtarget->hasAVX()); + //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only // supported in the AVX instruction set. Index: test/CodeGen/X86/sse41.ll =================================================================== --- test/CodeGen/X86/sse41.ll +++ test/CodeGen/X86/sse41.ll @@ -249,3 +249,31 @@ ; X64: ret } +define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) { +entry: + %0 = load <4 x float>* %pb, align 16 + %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> + ret <4 x float> %vecinit6 +; X32-LABEL: insertps_from_shufflevector_1: +; X32-NOT: shufps +; X32: insertps +; X32: ret +; X64-LABEL: insertps_from_shufflevector_1: +; X64-NOT: shufps +; X64: insertps +; X64: ret +} + +define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) { +entry: + %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %vecinit6 +; X32-LABEL: insertps_from_shufflevector_2: +; X32: insertps +; X32-NOT: shufps +; X32: ret +; X64-LABEL: insertps_from_shufflevector_2: +; X64: insertps +; X64-NOT: shufps +; X64: ret +}