Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10438,6 +10438,9 @@
   }
 
   if (EltVT == MVT::f32) {
+    // FIXME: We should generate a BLENDI here if we're not crossing lanes.
+    // BLENDPS has better performance than INSERTPS.
+
     // Bits [7:6] of the constant are the source select. This will always be
     // zero here. The DAG Combiner may combine an extract_elt index into
     // these
@@ -22897,17 +22900,19 @@
   MVT VT = N->getOperand(1)->getSimpleValueType(0);
   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
          "X86insertps is only defined for v4x32");
-
-  SDValue Ld = N->getOperand(1);
-  if (MayFoldLoad(Ld)) {
+
+  auto Imm8 = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (MayFoldLoad(N1)) {
     // Extract the countS bits from the immediate so we can get the proper
     // address when narrowing the vector load to a specific element.
     // When the second source op is a memory address, insertps doesn't use
     // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    auto DestIndex = Imm8 >> 6;
 
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+    auto Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(N1), DestIndex, DAG);
 
     // Create this as a scalar to vector to match the instruction pattern.
     SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
@@ -22916,7 +22921,41 @@
     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
                        LoadScalarToVector, N->getOperand(2));
   }
-  return SDValue();
+
+  // A register/register insertps that is just moving 32 bits to the
+  // corresponding location in the destination register can be simplified into
+  // a blendps. Both instructions are vector permutes, but blendps may have
+  // superior performance because it is a simpler operation.
+  // This also allows us to eliminate some pattern-matching possibilities for
+  // scalar SSE math ops that are performed in xmm registers and then shuffled.
+
+  // FIXME: If optimizing for size and there is a load folding opportunity,
+  // we should either not do this transform or we should undo it in
+  // PerformBLENDICombine. The above check for "MayFoldLoad" doesn't work
+  // because it doesn't look through a SCALAR_TO_VECTOR node.
+
+  switch (Imm8) {
+  default: return SDValue();
+
+  // The insertps immediate for the register/register variant is:
+  //   Bits [7:6] - select exactly one of four 32-bit source lanes
+  //   Bits [5:4] - select exactly one of four 32-bit destination lanes
+  //   Bits [3:0] - zero mask bonus operation
+
+  // The blendps immediate is:
+  //   Bits [3:0] - if a bit is set, copy the 32-bit source lane to the
+  //                corresponding destination lane.
+
+  // To do this transform, the source select bits [7:6] must match the
+  // destination select bits [5:4], and the zero mask bits [3:0] must be off.
+
+  case 0x00: Imm8 = 0x01; break; // copy src bits [31:0] to dest
+  case 0x50: Imm8 = 0x02; break; // copy src bits [63:32] to dest
+  case 0xA0: Imm8 = 0x04; break; // copy src bits [95:64] to dest
+  case 0xF0: Imm8 = 0x08; break; // copy src bits [127:96] to dest
+  }
+  SDValue NewMask = DAG.getConstant(Imm8, MVT::i8);
+  return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, NewMask);
 }
 
 static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -3179,13 +3179,6 @@
 
   // With SSE 4.1, insertps/blendi are preferred to movsd, so match those too.
   let Predicates = [UseSSE41] in {
-    // extracted scalar math op with insert via insertps
-    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-          FR32:$src))), (iPTR 0))),
-      (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst,
-          (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
     // extracted scalar math op with insert via blend
     def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
           (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
@@ -3202,14 +3195,7 @@
 
   // Repeat everything for AVX, except for the movss + scalar combo...
   // because that one shouldn't occur with AVX codegen?
-  let Predicates = [HasAVX] in {
-    // extracted scalar math op with insert via insertps
-    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-          FR32:$src))), (iPTR 0))),
-      (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
-          (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
+  let Predicates = [HasAVX] in {
     // extracted scalar math op with insert via blend
     def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
           (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
Index: test/CodeGen/X86/avx-load-store.ll
===================================================================
--- test/CodeGen/X86/avx-load-store.ll
+++ test/CodeGen/X86/avx-load-store.ll
@@ -23,20 +23,25 @@
 
 declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
 
-;;
-;; The two tests below check that we must fold load + scalar_to_vector
-;; + ins_subvec+ zext into only a single vmovss or vmovsd or vinsertps from memory
+
+; Although this could have a load folded vinsertps, we prefer
+; to use vblendps because it has better performance.
+; FIXME: If optimizing for size, we should generate a vinsertps.
 
 ; CHECK: mov00
 define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
   %val = load float* %ptr
-; CHECK: vinsertps
+; CHECK: vblendps
 ; CHECK: vinsertf128
   %i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
   ret <8 x float> %i0
 ; CHECK: ret
 }
 
+;;
+;; This test checks that we must fold load
+;; + ins_subvec + zext into only a single vmovlpd from memory
+
 ; CHECK: mov01
 define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
   %val = load double* %ptr
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -199,28 +199,36 @@
 
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
-define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-; X32-LABEL: insertps_2:
+; In cases where either blendps or insertps will do the job,
+; prefer blendps because it has better performance.
+
+; FIXME: If optimizing for size and there is a load folding opportunity,
+; we should generate a vinsertps.
+
+define <4 x float> @blendps_1(<4 x float> %t1, float %t2) nounwind {
+; X32-LABEL: blendps_1:
 ; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X32-NEXT: movss 4(%esp), {{.*}} mem[0],zero,zero,zero
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm{{.*}}[0],xmm0[1,2,3]
 ; X32-NEXT: retl
 ;
-; X64-LABEL: insertps_2:
+; X64-LABEL: blendps_1:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT: retq
   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
   ret <4 x float> %tmp1
 }
-define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-; X32-LABEL: insertps_3:
+
+define <4 x float> @blendps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
+; X32-LABEL: blendps_2:
 ; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X32-NEXT: retl
 ;
-; X64-LABEL: insertps_3:
+; X64-LABEL: blendps_2:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT: retq
   %tmp2 = extractelement <4 x float> %t2, i32 0
   %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
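
Aside, not part of the patch: the immediate translation performed by the new switch in PerformINSERTPSCombine can be sketched as a small standalone helper. This is only an illustration of the encoding rules described in the comments above; the helper name insertpsImmToBlendpsImm and its use of std::optional are hypothetical and do not appear in LLVM.

#include <cstdint>
#include <optional>

// Hypothetical sketch: map a register/register INSERTPS immediate to an
// equivalent BLENDPS immediate, when such a mapping exists.
std::optional<uint8_t> insertpsImmToBlendpsImm(uint8_t Imm8) {
  unsigned SrcLane  = (Imm8 >> 6) & 0x3; // bits [7:6]: source lane select
  unsigned DestLane = (Imm8 >> 4) & 0x3; // bits [5:4]: destination lane select
  unsigned ZeroMask = Imm8 & 0xF;        // bits [3:0]: lanes forced to zero

  // The transform only applies when no lane is zeroed and the value stays in
  // the same lane position, i.e. the immediates 0x00, 0x50, 0xA0, and 0xF0
  // handled by the switch in the patch.
  if (ZeroMask != 0 || SrcLane != DestLane)
    return std::nullopt;

  // BLENDPS copies source lane i to destination lane i for every set bit i in
  // its immediate, so a single same-lane copy becomes a one-hot mask.
  return uint8_t(1u << DestLane);
}

For example, insertpsImmToBlendpsImm(0xA0) yields 0x04, matching the case 0xA0 arm of the switch, while any immediate with a nonzero zero mask or mismatched source/destination lanes is rejected, mirroring the default: return SDValue(); path.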