Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9002,8 +9002,15 @@
       !isNullConstant(Extract->getOperand(1)))
     return SDValue();
 
+  // Floating-point reductions require reassociability and no-signed-zeros
+  // on the final step of the reduction because we may be changing the order of
+  // intermediate operations.
   SDValue Op = Extract->getOperand(0);
-  unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
+  if (Op.getValueType().isFloatingPoint()) {
+    SDNodeFlags Flags = Op->getFlags();
+    if (!Flags.hasNoSignedZeros() || !Flags.hasAllowReassociation())
+      return SDValue();
+  }
 
   // Match against one of the candidate binary ops.
   if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
@@ -9041,6 +9048,7 @@
   // While a partial reduction match would be:
   // <2,3,u,u,u,u,u,u>
   // <1,u,u,u,u,u,u,u>
+  unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
   SDValue PrevOp;
   for (unsigned i = 0; i < Stages; ++i) {
     unsigned MaskEnd = (1 << i);
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35425,9 +35425,9 @@
                                              const X86Subtarget &Subtarget) {
   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          "Unexpected caller");
 
-  // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
   ISD::NodeType Opc;
-  SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}, true);
+  SDValue Rdx =
+      DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
   if (!Rdx)
     return SDValue();
Index: llvm/test/CodeGen/X86/haddsub.ll
===================================================================
--- llvm/test/CodeGen/X86/haddsub.ll
+++ llvm/test/CodeGen/X86/haddsub.ll
@@ -1645,10 +1645,8 @@
 ;
 ; SSE3-FAST-LABEL: fadd_reduce_v8f32:
 ; SSE3-FAST:       # %bb.0:
-; SSE3-FAST-NEXT:    addps %xmm2, %xmm1
-; SSE3-FAST-NEXT:    movaps %xmm1, %xmm2
-; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE3-FAST-NEXT:    addps %xmm1, %xmm2
+; SSE3-FAST-NEXT:    haddps %xmm1, %xmm2
+; SSE3-FAST-NEXT:    haddps %xmm2, %xmm2
 ; SSE3-FAST-NEXT:    haddps %xmm2, %xmm2
 ; SSE3-FAST-NEXT:    addss %xmm2, %xmm0
 ; SSE3-FAST-NEXT:    retq
@@ -1668,9 +1666,8 @@
 ; AVX-FAST-LABEL: fadd_reduce_v8f32:
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
@@ -1691,9 +1688,9 @@
 ;
 ; SSE3-FAST-LABEL: fadd_reduce_v4f64:
 ; SSE3-FAST:       # %bb.0:
-; SSE3-FAST-NEXT:    addpd %xmm2, %xmm1
-; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm1
-; SSE3-FAST-NEXT:    addsd %xmm1, %xmm0
+; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm2
+; SSE3-FAST-NEXT:    haddpd %xmm2, %xmm2
+; SSE3-FAST-NEXT:    addsd %xmm2, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: fadd_reduce_v4f64:
@@ -1709,7 +1706,7 @@
 ; AVX-FAST-LABEL: fadd_reduce_v4f64:
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-FAST-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm2, %xmm1
 ; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
Index: llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -90,8 +90,7 @@
 ;
 ; AVX1-FAST-LABEL: test_v4f32:
 ; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    retq
@@ -156,9 +155,8 @@
 ; AVX1-FAST-LABEL: test_v8f32:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
@@ -350,8 +348,7 @@
 ;
 ; AVX1-FAST-LABEL: test_v4f32_zero:
 ; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    retq
 ;
@@ -411,9 +408,8 @@
 ; AVX1-FAST-LABEL: test_v8f32_zero:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
 ; AVX1-FAST-NEXT:    retq
@@ -597,8 +593,7 @@
 ;
 ; AVX1-FAST-LABEL: test_v4f32_undef:
 ; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    retq
 ;
@@ -658,9 +653,8 @@
 ; AVX1-FAST-LABEL: test_v8f32_undef:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
 ; AVX1-FAST-NEXT:    retq
@@ -834,7 +828,7 @@
 ; AVX1-FAST-LABEL: test_v4f64:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm2, %xmm1
 ; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
@@ -1053,7 +1047,7 @@
 ; AVX1-FAST-LABEL: test_v4f64_zero:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
 ; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
 ; AVX1-FAST-NEXT:    retq
@@ -1260,7 +1254,7 @@
 ; AVX1-FAST-LABEL: test_v4f64_undef:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
 ; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vzeroupper
 ; AVX1-FAST-NEXT:    retq
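
Note: for reference, a minimal sketch of the shuffle-based FP reduction that
matchBinOpReduction now accepts (the function name and exact IR below are
illustrative, not taken from the tests in this patch). Both fadds carry the
'reassoc' and 'nsz' fast-math flags; the check added in SelectionDAG.cpp only
inspects the final fadd feeding the extractelement and bails out if either
flag is missing there:

define float @hsum_v4f32(<4 x float> %v) {
  ; Stage 1: fold the high pair <2,3> onto the low pair <0,1>.
  %s1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %a1 = fadd reassoc nsz <4 x float> %v, %s1
  ; Stage 2: fold element 1 onto element 0.
  %s2 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %a2 = fadd reassoc nsz <4 x float> %a1, %s2
  ; combineExtractVectorElt sees this extract of lane 0; with the flags
  ; present, the chain should lower to back-to-back haddps on targets where
  ; horizontal ops are considered fast, as in the AVX1-FAST checks above.
  %r = extractelement <4 x float> %a2, i32 0
  ret float %r
}

Without 'reassoc' and 'nsz' on that final fadd, the new isFloatingPoint()
early return leaves the explicit shuffle+vaddps sequence alone, so strict-FP
code is unaffected.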