Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7446,6 +7446,17 @@
   return DAG.getConstant(Imm, MVT::i8);
 }
 
+/// \brief Helper to test for a load that can be folded with x86 shuffles.
+///
+/// This is particularly important because the set of instructions varies
+/// significantly based on whether the operand is a load or not.
+static bool isShuffleFoldableLoad(SDValue V) {
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  return ISD::isNON_EXTLoad(V.getNode());
+}
+
 /// \brief Try to emit a blend instruction for a shuffle.
 ///
 /// This doesn't do any checks for the availability of instructions for blending
@@ -7470,6 +7481,13 @@
   }
   switch (VT.SimpleTy) {
   case MVT::v2f64:
+    // Canonicalize this particular blend because it saves us a bunch of
+    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+    if (BlendMask == 2 && !isShuffleFoldableLoad(V1)) {
+      std::swap(V1, V2);
+      BlendMask = 1;
+    }
+    // FALLTHROUGH
   case MVT::v4f32:
   case MVT::v4f64:
   case MVT::v8f32:
@@ -8203,17 +8221,6 @@
   return SDValue();
 }
 
-/// \brief Helper to test for a load that can be folded with x86 shuffles.
-///
-/// This is particularly important because the set of instructions varies
-/// significantly based on whether the operand is a load or not.
-static bool isShuffleFoldableLoad(SDValue V) {
-  while (V.getOpcode() == ISD::BITCAST)
-    V = V.getOperand(0);
-
-  return ISD::isNON_EXTLoad(V.getNode());
-}
-
 /// \brief Try to lower insertion of a single element into a zero vector.
 ///
 /// This is a common pattern that we have especially efficient patterns to lower
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -3161,8 +3161,7 @@
 // addss %xmm1, %xmm0
 
 // TODO: Some canonicalization in lowering would simplify the number of
-// patterns we have to try to match. In particular, the reversed order blends
-// seem unnecessary.
+// patterns we have to try to match.
 multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
   let Predicates = [UseSSE1] in {
     // extracted scalar math op with insert via movss
@@ -3263,16 +3262,9 @@
     def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                        (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
               (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-                       (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                       (v2f64 VR128:$dst), (i8 2))),
-              (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 
-  // Repeat everything for AVX and add one more pattern
-  // (the scalar + blend reversed order) for good measure.
+  // Repeat everything for AVX.
   let Predicates = [HasAVX] in {
     // extracted scalar math op with insert via movsd
     def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
@@ -3288,13 +3280,6 @@
               (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
                 (COPY_TO_REGCLASS FR64:$src, VR128))>;
 
-    // extracted scalar math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector
-                       (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                       FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-              (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
-                (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
     // vector math op with insert via movsd
     def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                        (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
@@ -3304,12 +3289,6 @@
     def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                        (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
               (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-                       (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                       (v2f64 VR128:$dst), (i8 2))),
-              (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 }
 
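
For reference, the identity the new canonicalization in X86ISelLowering.cpp relies on: a two-lane blend with immediate 2 (0b10) takes lane 0 from its first operand and lane 1 from its second, which produces the same result as an immediate-1 (0b01) blend with the operands swapped. By always rewriting toward the mask-1 form, the .td files only need to match one operand order, which is why the reversed-order patterns above can be deleted. A minimal standalone sketch of that equivalence (plain C++, not LLVM code; the blend() helper and the test values are invented for illustration):

  #include <array>
  #include <cassert>

  using Vec2 = std::array<double, 2>;

  // Models a BLENDPD-style select: bit i of Mask set picks lane i from B,
  // clear picks lane i from A.
  static Vec2 blend(const Vec2 &A, const Vec2 &B, unsigned Mask) {
    Vec2 R;
    for (int i = 0; i < 2; ++i)
      R[i] = (Mask & (1u << i)) ? B[i] : A[i];
    return R;
  }

  int main() {
    Vec2 X = {1.0, 2.0}, Y = {3.0, 4.0};
    // Swapping the operands turns mask 2 (0b10) into mask 1 (0b01); both
    // sides evaluate to {1.0, 4.0}, so lowering only has to emit and
    // pattern-match the mask-1 form.
    assert(blend(X, Y, 2) == blend(Y, X, 1));
    return 0;
  }

The isShuffleFoldableLoad(V1) guard in the lowering change simply skips the swap when the first operand is a plain (non-extending) load, so the canonicalization does not disturb operand orders chosen for load folding.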