Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10473,6 +10473,10 @@
   }
 
   if (EltVT == MVT::f32) {
+    // FIXME: We should generate a BLENDI here if we're just inserting from
+    // the low lane to the low lane and not zeroing (IdxVal == 0).
+    // BLENDPS has better performance than INSERTPS in that case.
+
     // Bits [7:6] of the constant are the source select. This will always be
     // zero here. The DAG Combiner may combine an extract_elt index into
     // these
@@ -22947,17 +22951,19 @@
   MVT VT = N->getOperand(1)->getSimpleValueType(0);
   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
          "X86insertps is only defined for v4x32");
-
-  SDValue Ld = N->getOperand(1);
-  if (MayFoldLoad(Ld)) {
+
+  uint64_t Imm8 = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (MayFoldLoad(N1)) {
     // Extract the countS bits from the immediate so we can get the proper
     // address when narrowing the vector load to a specific element.
     // When the second source op is a memory address, insertps doesn't use
     // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    unsigned DestIdx = Imm8 >> 6;
 
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+    SDValue Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(N1), DestIdx, DAG);
 
     // Create this as a scalar to vector to match the instruction pattern.
     SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
@@ -22966,6 +22972,27 @@
     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
                        LoadScalarToVector, N->getOperand(2));
   }
+
+  // A register/register insertps that is just moving the low 32-bits to the
+  // corresponding location in the destination register can be simplified into
+  // a blendps. Blendps should have equal or better performance because it's a
+  // simpler operation.
+  // This also allows us to eliminate some pattern-matching possibilities for
+  // scalar SSE math ops that are performed in xmm registers and then shuffled.
+
+  // FIXME: If optimizing for size and there is a load folding opportunity,
+  // we should either not do this transform or we should undo it in
+  // PerformBLENDICombine. The above check for "MayFoldLoad" doesn't work
+  // because it doesn't look through a SCALAR_TO_VECTOR node.
+
+  if (Imm8 == 0x00) {
+    // We only convert insertps nodes that operate on the low element of the
+    // vector; converting anything else might cause an extra shuffle
+    // operation to be created.
+    SDValue NewMask = DAG.getConstant(0x01, MVT::i8);
+    return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, NewMask);
+  }
+
   return SDValue();
 }
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -3179,13 +3179,6 @@
   // With SSE 4.1, insertps/blendi are preferred to movsd, so match those too.
   let Predicates = [UseSSE41] in {
-  // extracted scalar math op with insert via insertps
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-        (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-        FR32:$src))), (iPTR 0))),
-    (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst,
-        (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
   // extracted scalar math op with insert via blend
   def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
         (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
@@ -3203,13 +3196,6 @@
   // Repeat everything for AVX, except for the movss + scalar combo...
   // because that one shouldn't occur with AVX codegen?
   let Predicates = [HasAVX] in {
-  // extracted scalar math op with insert via insertps
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-        (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-        FR32:$src))), (iPTR 0))),
-    (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
-        (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
   // extracted scalar math op with insert via blend
   def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
         (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
Index: test/CodeGen/X86/avx-load-store.ll
===================================================================
--- test/CodeGen/X86/avx-load-store.ll
+++ test/CodeGen/X86/avx-load-store.ll
@@ -23,20 +23,36 @@
 declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
 
-;;
-;; The two tests below check that we must fold load + scalar_to_vector
-;; + ins_subvec+ zext into only a single vmovss or vmovsd or vinsertps from memory
+; Although this could have a load-folded vinsertps, we prefer
+; to use vmovss + vblendps because it has better performance.
 
-; CHECK: mov00
-define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
+; CHECK-LABEL: mov_blendps:
+define <8 x float> @mov_blendps(<8 x float> %v, float * %ptr) nounwind {
   %val = load float, float* %ptr
-; CHECK: vinsertps
+; CHECK: vmovss
+; CHECK: vblendps
 ; CHECK: vinsertf128
   %i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
   ret <8 x float> %i0
 ; CHECK: ret
 }
 
+; Use vinsertps to load a scalar into a higher lane because there is no
+; version of vblendps that loads a scalar. Transferring out of the low
+; lane after a vmovss would require another shuffle operation.
+
+; CHECK-LABEL: mov_insertps:
+define <4 x float> @mov_insertps(<4 x float> %v, float * %ptr) nounwind {
+  %val = load float, float* %ptr
+; CHECK: vinsertps $29, (%rdi), %xmm0, %xmm0
+  %i0 = insertelement <4 x float> zeroinitializer, float %val, i32 1
+  ret <4 x float> %i0
+; CHECK: ret
+}
+
+;; This test checks that we must fold load + ins_subvec + zext
+;; into only a single vmovlpd from memory
+
 ; CHECK: mov01
 define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
   %val = load double, double* %ptr
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -199,28 +199,33 @@
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
-define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-; X32-LABEL: insertps_2:
+; In cases where either blendps or insertps will do the job,
+; prefer blendps because it has better performance.
+
+define <4 x float> @blendps_1(<4 x float> %t1, float %t2) nounwind {
+; X32-LABEL: blendps_1:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X32-NEXT:    movss 4(%esp), {{.*}} mem[0],zero,zero,zero
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm{{.*}}[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_2:
+; X64-LABEL: blendps_1:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
   ret <4 x float> %tmp1
 }
 
-define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-; X32-LABEL: insertps_3:
+
+define <4 x float> @blendps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
+; X32-LABEL: blendps_2:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_3:
+; X64-LABEL: blendps_2:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %tmp2 = extractelement <4 x float> %t2, i32 0
   %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
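
As a standalone illustration of the new combine (not part of the patch; the function name below is made up), a register/register low-element insert written in IR like this should now select a blendps with mask 0x01 instead of an insertps with immediate 0x00, matching the updated blendps_2 checks in sse41.ll above:

; Copy the low element of %b into the low element of %a.
define <4 x float> @low_elt_copy(<4 x float> %a, <4 x float> %b) nounwind {
  %s = extractelement <4 x float> %b, i32 0
  %r = insertelement <4 x float> %a, float %s, i32 0
  ret <4 x float> %r
}

; Expected with this patch (x86-64, SSE4.1):  blendps $1, %xmm1, %xmm0
; Without the patch:                          insertps $0, %xmm1, %xmm0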