Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -3931,6 +3931,29 @@
   return true;
 }
 
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS,
+/// i.e. three elements stay in place from one input vector and the fourth
+/// comes from the other input. Masks containing undef (-1) entries are
+/// conservatively rejected.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+  // TODO: Deal with AVX's VINSERTPS
+  if (VT != MVT::v4f32 && VT != MVT::v4i32)
+    return false;
+
+  unsigned InPlaceV1 = 0, InPlaceV2 = 0, TotalV1 = 0;
+  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
+    if (Mask[i] < 0)
+      return false;
+    TotalV1 += Mask[i] < 4;
+    InPlaceV1 += Mask[i] == i;
+    InPlaceV2 += Mask[i] == i + 4;
+  }
+
+  // Three lanes pass through one input; the odd lane must be from the other.
+  return (InPlaceV1 == 3 && TotalV1 == 3) || (InPlaceV2 == 3 && TotalV1 == 1);
+}
+
 //
 // Some special combinations that can be optimized.
 //
@@ -7263,6 +7286,84 @@
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // We also use it for transferring i32 from one register to another,
+  // since it simply copies the same bits.
+  // If we're transferring an i32 from memory to a specific element in a
+  // register, we output a generic DAG that will match the PINSRD
+  // instruction.
+  // TODO: Optimize for AVX cases too (VINSERTPS)
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "unsupported vector type for insertps/pinsrd");
+
+  // Undef mask entries are -1; they belong to neither input, so they must
+  // not be counted as elements of V1.
+  auto IsFromV1 = [](int M) { return M >= 0 && M < 4; };
+  auto IsFromV2 = [](int M) { return M >= 4; };
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(), IsFromV1);
+  assert((FromV1 == 1 ||
+          std::count_if(Mask.begin(), Mask.end(), IsFromV2) == 1) &&
+         "expected exactly one element from one of the shuffle inputs");
+
+  // From supplies the single inserted element; To supplies the other three.
+  const bool InsertFromV1 = FromV1 == 1;
+  SDValue From = InsertFromV1 ? V1 : V2;
+  SDValue To = InsertFromV1 ? V2 : V1;
+  unsigned DestIndex =
+      (InsertFromV1 ? std::find_if(Mask.begin(), Mask.end(), IsFromV1)
+                    : std::find_if(Mask.begin(), Mask.end(), IsFromV2)) -
+      Mask.begin();
+  // Lane of From to insert: mask values 4..7 name lanes 0..3 of V2.
+  unsigned SrcIndex = Mask[DestIndex] % 4;
+
+  if (MayFoldLoad(From)) {
+    // Trivial case, when From comes from a load and is only used by the
+    // shuffle. Load just the wanted element, which sits SrcIndex (not
+    // DestIndex) elements past the base address of the vector load.
+    LoadSDNode *Load = cast<LoadSDNode>(From);
+    unsigned Offset = SrcIndex * EVT.getStoreSize();
+    SDValue Addr = Load->getBasePtr();
+    MVT PtrVT = Addr.getSimpleValueType();
+    SDValue NewAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Addr,
+                                  DAG.getConstant(Offset, PtrVT));
+    SDValue NewLoad =
+        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                    DAG.getMachineFunction().getMachineMemOperand(
+                        Load->getMemOperand(), Offset, EVT.getStoreSize()));
+
+    if (EVT == MVT::f32) {
+      // Create this as a scalar to vector to match the instruction pattern.
+      SDValue LoadScalarToVector =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+                         InsertpsMask);
+    }
+    // EVT == MVT::i32
+    // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+    // instruction, to match the PINSRD instruction, which loads an i32 to a
+    // certain vector element.
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+                       DAG.getConstant(DestIndex, MVT::i32));
+  }
+
+  // Vector-element-to-vector: imm8 bits [7:6] select the source lane of From
+  // and bits [5:4] the destination lane in To.
+  SDValue InsertpsMask =
+      DAG.getIntPtrConstant((SrcIndex << 6) | (DestIndex << 4));
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
 // Reduce a vector shuffle to zext.
 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
                                     SelectionDAG &DAG) {
@@ -7674,6 +7775,9 @@
   if (BlendOp.getNode())
     return BlendOp;
 
+  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+    return getINSERTPS(SVOp, dl, DAG);
+
   unsigned Imm8;
   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
 
 @g16 = external global i16
 
@@ -249,3 +249,74 @@
 ; X64: ret
 }
 
+define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) { ; lanes 0-2 from %a, lane 3 from memory
+entry:
+  %0 = load <4 x float>* %pb, align 16 ; vector load whose only user is the shuffle
+  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ; lane 3 <- %0[0]; immediate is 3 << 4 = 48
+  ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_1:
+; CHECK-NOT: shufps
+; CHECK: insertps    $48,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) { ; register-to-register insert
+entry:
+  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3> ; lane 2 <- %b[1]; immediate is (1 << 6) | (2 << 4) = 96
+  ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_2:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps    $96,
+; CHECK: ret
+}
+
+; An i32 loaded from memory is inserted with pinsrd instead of insertps. Only a
+; separate vector load is forbidden: i686 still needs a movl to fetch %pb from the stack.
+define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
+entry:
+  %0 = load <4 x i32>* %pb, align 16 ; vector load whose only user is the shuffle
+  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ; lane 3 <- %0[0]
+  ret <4 x i32> %vecinit6
+; CHECK-LABEL: pinsrd_from_shufflevector_i32:
+; CHECK-NOT: mov{{aps|dqa}}
+; CHECK-NOT: shufps
+; CHECK: pinsrd  $3,
+; CHECK: ret
+}
+
+define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) { ; i32 register-to-register insert also uses insertps
+entry:
+  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3> ; lane 1 <- %b[3]; immediate is (3 << 6) | (1 << 4) = 208
+  ret <4 x i32> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_i32_2:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps    $208,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) { ; scalar load inserted into lane 1 of %a
+; CHECK-LABEL: insertps_from_load_ins_elt_undef:
+; CHECK-NOT: movss
+; CHECK-NOT: shufps
+; CHECK: insertps    $16,
+; CHECK: ret
+  %1 = load float* %b, align 4 ; must not become a separate movss; i686 still needs a movl to fetch %b from the stack
+  %2 = insertelement <4 x float> undef, float %1, i32 0 ; loaded value sits in lane 0, other lanes undef
+  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3> ; lane 1 <- %2[0]; immediate is 1 << 4 = 16
+  ret <4 x float> %result
+}
+
+define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) { ; scalar i32 load inserted into lane 2 of %a
+; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
+; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr. The next line is a deliberately disabled NOT check (note the 'a' before its prefix).
+;; aCHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps    $32,
+; CHECK: ret
+  %1 = load i32* %b, align 4
+  %2 = insertelement <4 x i32> undef, i32 %1, i32 0 ; loaded value sits in lane 0, other lanes undef
+  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3> ; lane 2 <- %2[0]; immediate is 2 << 4 = 32
+  ret <4 x i32> %result
+}