Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9002,14 +9002,27 @@
       !isNullConstant(Extract->getOperand(1)))
     return SDValue();
 
-  SDValue Op = Extract->getOperand(0);
-
   // Match against one of the candidate binary ops.
+  SDValue Op = Extract->getOperand(0);
   if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
         return Op.getOpcode() == unsigned(BinOp);
       }))
     return SDValue();
+
+  // Floating-point reductions may require relaxed constraints on the final step
+  // of the reduction because they may reorder intermediate operations.
   unsigned CandidateBinOp = Op.getOpcode();
+  if (Op.getValueType().isFloatingPoint()) {
+    SDNodeFlags Flags = Op->getFlags();
+    switch (CandidateBinOp) {
+    case ISD::FADD:
+      if (!Flags.hasNoSignedZeros() || !Flags.hasAllowReassociation())
+        return SDValue();
+      break;
+    default:
+      llvm_unreachable("Unhandled FP opcode for binop reduction");
+    }
+  }
 
   // Matching failed - attempt to see if we did enough stages that a partial
   // reduction from a subvector is possible.
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -35395,9 +35395,9 @@
                                              const X86Subtarget &Subtarget) {
   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
 
-  // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
   ISD::NodeType Opc;
-  SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}, true);
+  SDValue Rdx =
+      DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
   if (!Rdx)
     return SDValue();
 
Index: llvm/trunk/test/CodeGen/X86/haddsub.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub.ll
+++ llvm/trunk/test/CodeGen/X86/haddsub.ll
@@ -1645,10 +1645,8 @@
 ;
 ; SSE3-FAST-LABEL: fadd_reduce_v8f32:
 ; SSE3-FAST:       # %bb.0:
-; SSE3-FAST-NEXT:    addps %xmm2, %xmm1
-; SSE3-FAST-NEXT:    movaps %xmm1, %xmm2
-; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE3-FAST-NEXT:    addps %xmm1, %xmm2
+; SSE3-FAST-NEXT:    haddps %xmm1, %xmm2
+; SSE3-FAST-NEXT:    haddps %xmm2, %xmm2
 ; SSE3-FAST-NEXT:    haddps %xmm2, %xmm2
 ; SSE3-FAST-NEXT:    addss %xmm2, %xmm0
 ; SSE3-FAST-NEXT:    retq
@@ -1668,9 +1666,8 @@
 ; AVX-FAST-LABEL: fadd_reduce_v8f32:
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
@@ -1691,9 +1688,9 @@
 ;
 ; SSE3-FAST-LABEL: fadd_reduce_v4f64:
 ; SSE3-FAST:       # %bb.0:
-; SSE3-FAST-NEXT:    addpd %xmm2, %xmm1
-; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm1
-; SSE3-FAST-NEXT:    addsd %xmm1, %xmm0
+; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm2
+; SSE3-FAST-NEXT:    haddpd %xmm2, %xmm2
+; SSE3-FAST-NEXT:    addsd %xmm2, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: fadd_reduce_v4f64:
@@ -1709,7 +1706,7 @@
 ; AVX-FAST-LABEL: fadd_reduce_v4f64:
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-FAST-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm2, %xmm1
 ; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
@@ -2017,8 +2014,7 @@
 ;
 ; AVX-FAST-LABEL: partial_reduction_fadd_v8f32:
 ; AVX-FAST:       # %bb.0:
-; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
@@ -2030,6 +2026,9 @@
   ret float %r
 }
 
+; Negative test - only the flags on the final math op in the
+; sequence determine whether we can transform to horizontal ops.
+
 define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
 ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
 ; SSE3-SLOW:       # %bb.0:
@@ -2105,8 +2104,7 @@
 ;
 ; AVX-FAST-LABEL: partial_reduction_fadd_v16f32:
 ; AVX-FAST:       # %bb.0:
-; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
Index: llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -90,8 +90,7 @@
 ;
 ; AVX1-FAST-LABEL: test_v4f32:
 ; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    retq
@@ -156,9 +155,8 @@
 ; AVX1-FAST-LABEL: test_v8f32:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
@@ -350,8 +348,7 @@
 ;
 ; AVX1-FAST-LABEL: test_v4f32_zero:
 ; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    retq
 ;
@@ -411,9 +408,8 @@
 ; AVX1-FAST-LABEL: test_v8f32_zero:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
 ; AVX1-FAST-NEXT:    retq
@@ -597,8 +593,7 @@
 ;
 ; AVX1-FAST-LABEL: test_v4f32_undef:
 ; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    retq
 ;
@@ -658,9 +653,8 @@
 ; AVX1-FAST-LABEL: test_v8f32_undef:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
 ; AVX1-FAST-NEXT:    retq
@@ -834,7 +828,7 @@
 ; AVX1-FAST-LABEL: test_v4f64:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm2, %xmm1
 ; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
@@ -1053,7 +1047,7 @@
 ; AVX1-FAST-LABEL: test_v4f64_zero:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
 ; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
 ; AVX1-FAST-NEXT:    retq
@@ -1260,7 +1254,7 @@
 ; AVX1-FAST-LABEL: test_v4f64_undef:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
 ; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
 ; AVX1-FAST-NEXT:    retq
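For context, a minimal sketch of the kind of IR that matchBinOpReduction now accepts for FADD (the function name and value names here are illustrative, not taken from the tests in this patch). As the negative test above notes, only the final fadd of the shuffle-reduction sequence has its flags checked, via hasAllowReassociation() and hasNoSignedZeros(), which correspond to the IR flags reassoc and nsz:

define float @fadd_reduction_sketch(<4 x float> %v) {
  ; Stage 1: fold the high half of the vector onto the low half.
  %hi = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %sum1 = fadd <4 x float> %v, %hi
  ; Stage 2: fold element 1 onto element 0; this final op must carry the flags.
  %hi2 = shufflevector <4 x float> %sum1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %sum2 = fadd reassoc nsz <4 x float> %sum1, %hi2
  %r = extractelement <4 x float> %sum2, i32 0
  ret float %r
}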