Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7851,10 +7851,13 @@
   return DstVec;
 }
 
-/// Return true if \p N implements a horizontal binop and return the
-/// operands for the horizontal binop into V0 and V1.
-///
 /// This is a helper function of LowerToHorizontalOp().
-/// This function checks that the build_vector \p N in input implements a
-/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
-/// operation to match.
+/// This function checks that the input build_vector \p N implements an x86
+/// 128-bit horizontal operation. It is also used to match 256-bit horizontal
+/// operations, but in that case the caller must adjust the results to form a
+/// valid x86 256-bit horizontal instruction. In other words, if this returns
+/// true for a 256-bit input, some extraction/insertion will be required to
+/// produce a horizontal instruction, because the vector element indices do
+/// not correspond to the element order of x86's 256-bit AVX ops.
+///
+/// Parameter \p Opcode defines the kind of horizontal operation to match.
@@ -8263,14 +8266,21 @@
     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
-        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
-      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
-
+        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) {
+      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
+      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
+      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
+      return DAG.getNode(X86ISD::FHADD, DL, VT, V0, V1);
+    }
     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
-        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
-      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) {
+      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
+      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
+      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
+      return DAG.getNode(X86ISD::FHSUB, DL, VT, V0, V1);
+    }
   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
     // Try to match an AVX2 horizontal add/sub of signed integers.
     SDValue InVec2, InVec3;
@@ -8291,10 +8301,14 @@
       CanFold = false;
 
     if (CanFold) {
+      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
+      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
+      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
+
       // Fold this build_vector into a single horizontal add/sub.
       // Do this only if the target has AVX2.
       if (Subtarget.hasAVX2())
-        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
+        return DAG.getNode(X86Opcode, DL, VT, V0, V1);
 
       // Do not try to expand this build_vector into a pair of horizontal
       // add/sub if we can emit a pair of scalar add/sub.
@@ -8305,13 +8319,16 @@
       // a concat vector.
       bool isUndefLO = NumUndefsLO == Half;
       bool isUndefHI = NumUndefsHI == Half;
-      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
+      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false,
                                    isUndefLO, isUndefHI);
     }
   }
 
   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
        VT == MVT::v16i16) && Subtarget.hasAVX()) {
+    // WARNING: Matching 256-bit horizontal ops here requires
+    // extracting/inserting from the source vectors; isHorizontalBinOp()
+    // does not match the element order of 256-bit AVX horizontal ops.
    unsigned X86Opcode;
     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
       X86Opcode = X86ISD::HADD;
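
A note for review, outside the patch itself: the index mismatch that the new
comment and WARNING describe comes from VHADDPS (and the other 256-bit
horizontal ops) operating on two independent 128-bit lanes. The standalone C++
sketch below models the documented ymm behavior of VHADDPS so that claim can be
checked by hand; the helper name vhaddps256 and the sample values are invented
for illustration.

#include <array>
#include <cstdio>

// Model of VHADDPS ymm (Intel operand order: dst = vhaddps(src1, src2)).
// Each 128-bit lane is independent: within a lane, src1 feeds the low two
// result elements and src2 feeds the high two.
static std::array<float, 8> vhaddps256(const std::array<float, 8> &S1,
                                       const std::array<float, 8> &S2) {
  return {S1[0] + S1[1], S1[2] + S1[3],   // low lane, from src1
          S2[0] + S2[1], S2[2] + S2[3],   // low lane, from src2
          S1[4] + S1[5], S1[6] + S1[7],   // high lane, from src1
          S2[4] + S2[5], S2[6] + S2[7]};  // high lane, from src2
}

int main() {
  std::array<float, 8> A = {0, 1, 2, 3, 4, 5, 6, 7};
  std::array<float, 8> B = {10, 11, 12, 13, 14, 15, 16, 17};
  std::array<float, 8> R = vhaddps256(A, B);

  // Sums of the high halves of A and B land in elements 4 and 6, not in
  // adjacent elements: this is the index mismatch the comments describe.
  std::printf("R[4] = A[4]+A[5] = %g\n", R[4]);  // 9
  std::printf("R[6] = B[4]+B[5] = %g\n", R[6]);  // 29
  return 0;
}

Building the result from one register twice (the old vhaddps %ymm0, %ymm0,
%ymm0 output) can never place a sum of B's elements into element 6, which is
why the updated CHECK lines below expect a second source register.
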
Index: test/CodeGen/X86/haddsub-undef.ll
===================================================================
--- test/CodeGen/X86/haddsub-undef.ll
+++ test/CodeGen/X86/haddsub-undef.ll
@@ -300,7 +300,7 @@
 ;
 ; AVX-LABEL: test11_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
@@ -934,12 +934,12 @@
 ;
 ; AVX1-SLOW-LABEL: v16f32_inputs_v8f32_output_4567:
 ; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX1-SLOW-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
 ; AVX1-SLOW-NEXT:    retq
 ;
 ; AVX1-FAST-LABEL: v16f32_inputs_v8f32_output_4567:
 ; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX1-FAST-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
 ; AVX1-FAST-NEXT:    retq
 ;
 ; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
@@ -973,7 +973,7 @@
 ;
 ; AVX-LABEL: PR40243:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %a4 = extractelement <8 x float> %a, i32 4
   %a5 = extractelement <8 x float> %a, i32 5
Index: test/CodeGen/X86/phaddsub-undef.ll
===================================================================
--- test/CodeGen/X86/phaddsub-undef.ll
+++ test/CodeGen/X86/phaddsub-undef.ll
@@ -75,12 +75,12 @@
 ;
 ; AVX2-LABEL: test15_undef:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test15_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
@@ -101,16 +101,20 @@
 ;
 ; AVX1-LABEL: PR40243_alt:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR40243_alt:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: PR40243_alt:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %a4 = extractelement <8 x i32> %a, i32 4
   %a5 = extractelement <8 x i32> %a, i32 5
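
On AVX1 there is no 256-bit integer vphaddd, so the PR40243_alt checks now
expect the extract/hadd/insert expansion instead of a single ymm op. A second
illustrative sketch (same caveats: invented helper names and sample values, and
the test's IR body is truncated above, so the element placement follows the
AT&T-order reading of the CHECK lines) traces that sequence to show where the
sums land.

#include <array>
#include <cstdio>

using V4 = std::array<int, 4>;
using V8 = std::array<int, 8>;

// 128-bit PHADDD (Intel operand order: dst = phaddd(src1, src2)).
static V4 phaddd128(const V4 &S1, const V4 &S2) {
  return {S1[0] + S1[1], S1[2] + S1[3], S2[0] + S2[1], S2[2] + S2[3]};
}

// vextractf128 $Lane: grab one 128-bit half of a 256-bit value.
static V4 extract128(const V8 &V, int Lane) {
  return {V[Lane * 4], V[Lane * 4 + 1], V[Lane * 4 + 2], V[Lane * 4 + 3]};
}

int main() {
  V8 A = {0, 1, 2, 3, 4, 5, 6, 7};         // %ymm0
  V8 B = {10, 11, 12, 13, 14, 15, 16, 17}; // %ymm1

  // Mirrors the expected AVX1 sequence:
  //   vextractf128 $1, %ymm1, %xmm1       ; high half of b
  //   vextractf128 $1, %ymm0, %xmm0       ; high half of a
  //   vphaddd %xmm1, %xmm0, %xmm0         ; one 128-bit horizontal add
  //   vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; sums -> result elements 4..7
  V4 Sums = phaddd128(extract128(A, 1), extract128(B, 1));

  for (int i = 0; i != 4; ++i)
    std::printf("result[%d] = %d\n", i + 4, Sums[i]); // 9, 13, 29, 33
  return 0;
}

Read this way, the four sums end up in result elements 4 through 7: a4+a5,
a6+a7, b4+b5, b6+b7, which is consistent with a build_vector whose defined
lanes are built from the high halves of both inputs.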