Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7851,10 +7851,13 @@
   return DstVec;
 }
 
-/// Return true if \p N implements a horizontal binop and return the
-/// operands for the horizontal binop into V0 and V1.
-///
 /// This is a helper function of LowerToHorizontalOp().
-/// This function checks that the build_vector \p N in input implements a
-/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
-/// operation to match.
+/// This function checks that the input build_vector \p N implements an x86
+/// 128-bit horizontal operation. It is also used to match 256-bit horizontal
+/// operations, but in that case the caller must adjust the results to form a
+/// valid x86 256-bit horizontal instruction. In other words, if this returns
+/// true for a 256-bit input, some extraction/insertion will be required to
+/// produce a horizontal instruction, because the vector element indices do
+/// not correspond to the element order of x86's 256-bit AVX ops.
+///
+/// Parameter \p Opcode defines the kind of horizontal operation to match.
@@ -8263,14 +8266,21 @@
     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
-        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
-      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
-
+        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) {
+      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
+      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
+      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
+      return DAG.getNode(X86ISD::FHADD, DL, VT, V0, V1);
+    }
     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
-        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
-      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) {
+      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
+      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
+      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
+      return DAG.getNode(X86ISD::FHSUB, DL, VT, V0, V1);
+    }
   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
     // Try to match an AVX2 horizontal add/sub of signed integers.
     SDValue InVec2, InVec3;
@@ -8291,10 +8301,14 @@
       CanFold = false;
 
     if (CanFold) {
+      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
+      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
+      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
+
       // Fold this build_vector into a single horizontal add/sub.
       // Do this only if the target has AVX2.
       if (Subtarget.hasAVX2())
-        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
+        return DAG.getNode(X86Opcode, DL, VT, V0, V1);
 
       // Do not try to expand this build_vector into a pair of horizontal
       // add/sub if we can emit a pair of scalar add/sub.
@@ -8305,13 +8319,16 @@
       // a concat vector.
       bool isUndefLO = NumUndefsLO == Half;
       bool isUndefHI = NumUndefsHI == Half;
-      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
+      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false,
                                    isUndefLO, isUndefHI);
     }
   }
 
   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
        VT == MVT::v16i16) && Subtarget.hasAVX()) {
+    // WARNING: Matching 256-bit horizontal ops here requires
+    // extracting/inserting from the source vectors; isHorizontalBinOp()
+    // does not match the element order of 256-bit AVX horizontal ops.
    unsigned X86Opcode;
     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
       X86Opcode = X86ISD::HADD;
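
A note for review, outside the patch itself: the index mismatch that the new
comment and WARNING describe comes from VHADDPS (and the other 256-bit
horizontal ops) operating on two independent 128-bit lanes. The standalone C++
sketch below models the documented ymm behavior of VHADDPS so that claim can be
checked by hand; the helper name vhaddps256 and the sample values are invented
for illustration.

#include <array>
#include <cstdio>

// Model of VHADDPS ymm (Intel operand order: dst = vhaddps(src1, src2)).
// Each 128-bit lane is independent: within a lane, src1 feeds the low two
// result elements and src2 feeds the high two.
static std::array<float, 8> vhaddps256(const std::array<float, 8> &S1,
                                       const std::array<float, 8> &S2) {
  return {S1[0] + S1[1], S1[2] + S1[3],   // low lane, from src1
          S2[0] + S2[1], S2[2] + S2[3],   // low lane, from src2
          S1[4] + S1[5], S1[6] + S1[7],   // high lane, from src1
          S2[4] + S2[5], S2[6] + S2[7]};  // high lane, from src2
}

int main() {
  std::array<float, 8> A = {0, 1, 2, 3, 4, 5, 6, 7};
  std::array<float, 8> B = {10, 11, 12, 13, 14, 15, 16, 17};
  std::array<float, 8> R = vhaddps256(A, B);

  // Sums of the high halves of A and B land in elements 4 and 6, not in
  // adjacent elements: this is the index mismatch the comments describe.
  std::printf("R[4] = A[4]+A[5] = %g\n", R[4]);  // 9
  std::printf("R[6] = B[4]+B[5] = %g\n", R[6]);  // 29
  return 0;
}

Building the result from one register twice (the old vhaddps %ymm0, %ymm0,
%ymm0 output) can never place a sum of B's elements into element 6, which is
why the updated CHECK lines below expect a second source register.
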
Index: test/CodeGen/X86/haddsub-undef.ll
===================================================================
--- test/CodeGen/X86/haddsub-undef.ll
+++ test/CodeGen/X86/haddsub-undef.ll
@@ -300,7 +300,7 @@
 ;
 ; AVX-LABEL: test11_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
@@ -934,12 +934,12 @@
 ;
 ; AVX1-SLOW-LABEL: v16f32_inputs_v8f32_output_4567:
 ; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX1-SLOW-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
 ; AVX1-SLOW-NEXT:    retq
 ;
 ; AVX1-FAST-LABEL: v16f32_inputs_v8f32_output_4567:
 ; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX1-FAST-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
 ; AVX1-FAST-NEXT:    retq
 ;
 ; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
@@ -973,7 +973,7 @@
 ;
 ; AVX-LABEL: PR40243:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %a4 = extractelement <8 x float> %a, i32 4
   %a5 = extractelement <8 x float> %a, i32 5
Index: test/CodeGen/X86/phaddsub-undef.ll
===================================================================
--- test/CodeGen/X86/phaddsub-undef.ll
+++ test/CodeGen/X86/phaddsub-undef.ll
@@ -75,12 +75,12 @@
 ;
 ; AVX2-LABEL: test15_undef:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test15_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
@@ -101,16 +101,20 @@
 ;
 ; AVX1-LABEL: PR40243_alt:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR40243_alt:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: PR40243_alt:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %a4 = extractelement <8 x i32> %a, i32 4
   %a5 = extractelement <8 x i32> %a, i32 5
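
On AVX1 there is no 256-bit integer vphaddd, so the PR40243_alt checks now
expect the extract/hadd/insert expansion instead of a single ymm op. A second
illustrative sketch (same caveats: invented helper names and sample values, and
the test's IR body is truncated above, so the element placement follows the
AT&T-order reading of the CHECK lines) traces that sequence to show where the
sums land.

#include <array>
#include <cstdio>

using V4 = std::array<int, 4>;
using V8 = std::array<int, 8>;

// 128-bit PHADDD (Intel operand order: dst = phaddd(src1, src2)).
static V4 phaddd128(const V4 &S1, const V4 &S2) {
  return {S1[0] + S1[1], S1[2] + S1[3], S2[0] + S2[1], S2[2] + S2[3]};
}

// vextractf128 $Lane: grab one 128-bit half of a 256-bit value.
static V4 extract128(const V8 &V, int Lane) {
  return {V[Lane * 4], V[Lane * 4 + 1], V[Lane * 4 + 2], V[Lane * 4 + 3]};
}

int main() {
  V8 A = {0, 1, 2, 3, 4, 5, 6, 7};         // %ymm0
  V8 B = {10, 11, 12, 13, 14, 15, 16, 17}; // %ymm1

  // Mirrors the expected AVX1 sequence:
  //   vextractf128 $1, %ymm1, %xmm1       ; high half of b
  //   vextractf128 $1, %ymm0, %xmm0       ; high half of a
  //   vphaddd %xmm1, %xmm0, %xmm0         ; one 128-bit horizontal add
  //   vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; sums -> result elements 4..7
  V4 Sums = phaddd128(extract128(A, 1), extract128(B, 1));

  for (int i = 0; i != 4; ++i)
    std::printf("result[%d] = %d\n", i + 4, Sums[i]); // 9, 13, 29, 33
  return 0;
}

Read this way, the four sums end up in result elements 4 through 7: a4+a5,
a6+a7, b4+b5, b6+b7, which is consistent with a build_vector whose defined
lanes are built from the high halves of both inputs.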