Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -678,6 +678,8 @@ // Don't recurse exponentially. if (Depth > 6) return 0; + bool UnsafeFPMath = Options->UnsafeFPMath || Op->isFast(); + switch (Op.getOpcode()) { default: return false; case ISD::ConstantFP: { @@ -691,7 +693,7 @@ } case ISD::FADD: // FIXME: determine better conditions for this xform. - if (!Options->UnsafeFPMath) return 0; + if (!UnsafeFPMath) return 0; // After operation legalization, it might not be legal to create new FSUBs. if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) @@ -715,7 +717,7 @@ case ISD::FMUL: case ISD::FDIV: - if (Options->HonorSignDependentRoundingFPMath()) return 0; + if (Options->HonorSignDependentRoundingFPMathOption && !UnsafeFPMath) return 0; // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y)) if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, @@ -753,7 +755,7 @@ } case ISD::FADD: // FIXME: determine better conditions for this xform. - assert(Options.UnsafeFPMath); + assert(Options.UnsafeFPMath || Op->isFast()); // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, @@ -6742,10 +6744,12 @@ // FIXME: Instead of testing for UnsafeFPMath, this should be checking for // no signed zeros as well as no nans. + SDValue Cmp = N0.getOperand(2); const TargetOptions &Options = DAG.getTarget().Options; - if (Options.UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() && + bool UnsafeFPMath = Options.UnsafeFPMath || Cmp->isFast(); + if (UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() && DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { - ISD::CondCode CC = cast(N0.getOperand(2))->get(); + ISD::CondCode CC = cast(Cmp)->get(); if (SDValue FMinMax = combineMinNumMaxNum( DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG)) @@ -9566,8 +9570,10 @@ if (!HasFMAD && !HasFMA) return SDValue(); + bool CanFuse = Options.UnsafeFPMath || N->isFast(); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD); + CanFuse || HasFMAD); + SDNodeFlags Flags = N->getFlags(); // If the addition is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); @@ -9596,15 +9602,19 @@ // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), N1); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + N0.getOperand(0), N0.getOperand(1), N1); + Val->setFlags(Flags); + return Val; } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N1.getOperand(0), N1.getOperand(1), N0); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + N1.getOperand(0), N1.getOperand(1), N0); + Val->setFlags(Flags); + return Val; } // Look through FP_EXTEND nodes to do more combining. 
@@ -9614,11 +9624,13 @@ SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), N1); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(1)), N1); + Val->setFlags(Flags); + return Val; } } @@ -9628,56 +9640,65 @@ SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(1)), N0); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(1)), N0); + Val->setFlags(Flags); + return Val; } } // More folding opportunities when target permits. if (Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && + if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && N0.getOperand(2).getOpcode() == ISD::FMUL && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(2).getOperand(0), - N0.getOperand(2).getOperand(1), - N1)); + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + N0.getOperand(2).getOperand(0), + N0.getOperand(2).getOperand(1), + N1); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + N0.getOperand(0), N0.getOperand(1), + Fma); + Val->setFlags(Flags); + return Val; } // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. 
- if (Options.UnsafeFPMath && + if (CanFuse && N1->getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FMUL && N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N1.getOperand(0), N1.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - N1.getOperand(2).getOperand(0), - N1.getOperand(2).getOperand(1), - N0)); + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + N1.getOperand(2).getOperand(0), + N1.getOperand(2).getOperand(1), + N0); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + N1.getOperand(0), N1.getOperand(1), + Fma); + Val->setFlags(Flags); + return Val; } - // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&] ( - SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, U), - DAG.getNode(ISD::FP_EXTEND, SL, VT, V), - Z)); + SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, + SDNodeFlags Flags) { + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, U), + DAG.getNode(ISD::FP_EXTEND, SL, VT, V), + Z); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, Fma); + Val->setFlags(Flags); + return Val; }; if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); @@ -9687,7 +9708,7 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), N020.getOperand(0), N020.getOperand(1), - N1); + N1, Flags); } } } @@ -9698,14 +9719,19 @@ // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. 
auto FoldFAddFPExtFMAFMul = [&] ( - SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, X), - DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, U), - DAG.getNode(ISD::FP_EXTEND, SL, VT, V), - Z)); + SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, + SDNodeFlags Flags) { + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, U), + DAG.getNode(ISD::FP_EXTEND, SL, VT, V), + Z); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, X), + DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), + Fma); + Val->setFlags(Flags); + return Val; }; if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); @@ -9715,7 +9741,7 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), N002.getOperand(0), N002.getOperand(1), - N1); + N1, Flags); } } } @@ -9730,7 +9756,7 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), N120.getOperand(0), N120.getOperand(1), - N0); + N0, Flags); } } } @@ -9748,7 +9774,7 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), N102.getOperand(0), N102.getOperand(1), - N0); + N0, Flags); } } } @@ -9777,8 +9803,11 @@ if (!HasFMAD && !HasFMA) return SDValue(); + bool CanFuse = Options.UnsafeFPMath || N->isFast(); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD); + CanFuse || HasFMAD); + const SDNodeFlags Flags = N->getFlags(); + // If the subtraction is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); @@ -9801,27 +9830,35 @@ // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), - DAG.getNode(ISD::FNEG, SL, VT, N1)); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + N0.getOperand(0), N0.getOperand(1), + DAG.getNode(ISD::FNEG, SL, VT, N1)); + Val->setFlags(Flags); + return Val; } // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. - if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) - return DAG.getNode(PreferredFusedOpcode, SL, VT, + if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { + SDValue Val = + DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), N0); + Val->setFlags(Flags); + return Val; + } // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N00), N01, - DAG.getNode(ISD::FNEG, SL, VT, N1)); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N00), N01, + DAG.getNode(ISD::FNEG, SL, VT, N1)); + Val->setFlags(Flags); + return Val; } // Look through FP_EXTEND nodes to do more combining. 
@@ -9832,12 +9869,14 @@ SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, N1)); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, N1)); + Val->setFlags(Flags); + return Val; } } @@ -9848,13 +9887,15 @@ SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(0))), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(1)), - N0); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(1)), + N0); + Val->setFlags(Flags); + return Val; } } @@ -9876,7 +9917,7 @@ N000.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), - N1)); + N1), Flags); } } } @@ -9899,7 +9940,7 @@ N000.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), - N1)); + N1), Flags); } } } @@ -9908,39 +9949,38 @@ if (Aggressive) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode && + if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(2).getOperand(0), - N0.getOperand(2).getOperand(1), - DAG.getNode(ISD::FNEG, SL, VT, - N1))); + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + N0.getOperand(2).getOperand(0), + N0.getOperand(2).getOperand(1), + DAG.getNode(ISD::FNEG, SL, VT, N1)); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + N0.getOperand(0), N0.getOperand(1), Fma); + Val->setFlags(Flags); + return Val; } // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. 
- if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode && + if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N1.getOperand(2))) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - N1.getOperand(0)), - N1.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N20), - - N21, N0)); + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N20), + N21, N0); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + N1.getOperand(0)), + N1.getOperand(1), Fma); + Val->setFlags(Flags); + return Val; } - // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode) { @@ -9949,15 +9989,18 @@ SDValue N020 = N02.getOperand(0); if (isContractableFMUL(N020) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N020.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N020.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, - N1))); + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N020.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N020.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, + N1)); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + N0.getOperand(0), N0.getOperand(1), Fma); + Val->setFlags(Flags); + return Val; } } } @@ -9974,18 +10017,22 @@ SDValue N002 = N00.getOperand(2); if (isContractableFMUL(N002) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N002.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N002.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, - N1))); + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N002.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N002.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, + N1)); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(1)), + Fma); + Val->setFlags(Flags); + return Val; } } } @@ -9999,16 +10046,20 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { SDValue N1200 = N120.getOperand(0); SDValue N1201 = N120.getOperand(1); - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), - N1.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, - VT, N1200)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N1201), - N0)); + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, + VT, N1200)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N1201), + N0); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, 
SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), + N1.getOperand(1), + Fma); + Val->setFlags(Flags); + return Val; } } @@ -10028,18 +10079,21 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) { SDValue N1020 = N102.getOperand(0); SDValue N1021 = N102.getOperand(1); - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N100)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, - VT, N1020)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N1021), - N0)); + SDValue Fma = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, + VT, N1020)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N1021), + N0); + Fma->setFlags(Flags); + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, + SL, VT, N100)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), + Fma); + Val->setFlags(Flags); + return Val; } } } @@ -10055,6 +10109,7 @@ SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); + const SDNodeFlags Flags = N->getFlags(); assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation"); @@ -10086,52 +10141,70 @@ // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y) // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y)) - auto FuseFADD = [&](SDValue X, SDValue Y) { + auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) { auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); - if (XC1 && XC1->isExactlyValue(+1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); - if (XC1 && XC1->isExactlyValue(-1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y)); + if (XC1 && XC1->isExactlyValue(+1.0)) { + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); + Val->setFlags(Flags); + return Val; + } + if (XC1 && XC1->isExactlyValue(-1.0)) { + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y)); + Val->setFlags(Flags); + return Val; + } } return SDValue(); }; - if (SDValue FMA = FuseFADD(N0, N1)) + if (SDValue FMA = FuseFADD(N0, N1, Flags)) return FMA; - if (SDValue FMA = FuseFADD(N1, N0)) + if (SDValue FMA = FuseFADD(N1, N0, Flags)) return FMA; // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y) // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y)) // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y)) // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y) - auto FuseFSUB = [&](SDValue X, SDValue Y) { + auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) { auto XC0 = isConstOrConstSplatFP(X.getOperand(0)); - if (XC0 && XC0->isExactlyValue(+1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, - Y); - if (XC0 && XC0->isExactlyValue(-1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y)); + if (XC0 && XC0->isExactlyValue(+1.0)) { + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, + Y); + Val->setFlags(Flags); + return Val; + } + if (XC0 && 
XC0->isExactlyValue(-1.0)) { + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y)); + Val->setFlags(Flags); + return Val; + } auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); - if (XC1 && XC1->isExactlyValue(+1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y)); - if (XC1 && XC1->isExactlyValue(-1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); + if (XC1 && XC1->isExactlyValue(+1.0)) { + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y)); + Val->setFlags(Flags); + return Val; + } + if (XC1 && XC1->isExactlyValue(-1.0)) { + SDValue Val = DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); + Val->setFlags(Flags); + return Val; + } } return SDValue(); }; - if (SDValue FMA = FuseFSUB(N0, N1)) + if (SDValue FMA = FuseFSUB(N0, N1, Flags)) return FMA; - if (SDValue FMA = FuseFSUB(N1, N0)) + if (SDValue FMA = FuseFSUB(N1, N0, Flags)) return FMA; return SDValue(); @@ -10202,7 +10275,7 @@ } // If 'unsafe math' is enabled, fold lots of things. - if (Options.UnsafeFPMath) { + if (Options.UnsafeFPMath || Flags.isFast()) { // No FP constant should be created after legalization as Instruction // Selection pass has a hard time dealing with FP constants. bool AllowNewConst = (Level < AfterLegalizeDAG); @@ -10337,7 +10410,7 @@ GetNegatedExpression(N1, DAG, LegalOperations), Flags); // FIXME: Auto-upgrade the target/function-level option. - if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) { + if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) { // (fsub 0, B) -> -B if (N0CFP && N0CFP->isZero()) { if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) @@ -10348,7 +10421,7 @@ } // If 'unsafe math' is enabled, fold lots of things. - if (Options.UnsafeFPMath) { + if (Options.UnsafeFPMath || Flags.isFast()) { // (fsub A, 0) -> A if (N1CFP && N1CFP->isZero()) return N0; @@ -10413,7 +10486,7 @@ if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; - if (Options.UnsafeFPMath) { + if (Options.UnsafeFPMath || Flags.isFast()) { // fold (fmul A, 0) -> 0 if (N1CFP && N1CFP->isZero()) return N1; @@ -10542,6 +10615,10 @@ SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; + // FMA nodes have flags that propagate to the created nodes. + const SDNodeFlags Flags = N->getFlags(); + bool UnsafeFPMath = Options.UnsafeFPMath || Flags.isFast(); + // Constant fold FMA. if (isa(N0) && isa(N1) && @@ -10549,7 +10626,7 @@ return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); } - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; if (N1CFP && N1CFP->isZero()) @@ -10566,12 +10643,7 @@ !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); - // TODO: FMA nodes should have flags that propagate to the created nodes. - // For now, create a Flags object for use with reassociation math transforms. 
- SDNodeFlags Flags; - Flags.setAllowReassociation(true); - - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && isConstantFPBuildVectorOrConstantFP(N1) && @@ -10617,7 +10689,7 @@ } } - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { // (fma x, c, x) -> (fmul x, (c+1)) if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, DL, VT, N0, @@ -10646,7 +10718,7 @@ // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { - bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath; + bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath || N->isFast(); const SDNodeFlags Flags = N->getFlags(); if (!UnsafeMath && !Flags.hasAllowReciprocal()) return SDValue(); @@ -10724,7 +10796,7 @@ if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; - if (Options.UnsafeFPMath) { + if (Options.UnsafeFPMath || N->isFast()) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. if (N1CFP) { // Compute the reciprocal 1.0 / c2. @@ -10833,17 +10905,15 @@ } SDValue DAGCombiner::visitFSQRT(SDNode *N) { - if (!DAG.getTarget().Options.UnsafeFPMath) + SDNodeFlags Flags = N->getFlags(); + if (!DAG.getTarget().Options.UnsafeFPMath && !Flags.isFast()) return SDValue(); SDValue N0 = N->getOperand(0); if (TLI.isFsqrtCheap(N0, DAG)) return SDValue(); - // TODO: FSQRT nodes should have flags that propagate to the created nodes. - // For now, create a Flags object for use with reassociation math transforms. - SDNodeFlags Flags; - Flags.setAllowReassociation(true); + // FSQRT nodes have flags that propagate to the created nodes. return buildSqrtEstimate(N0, Flags); } @@ -11138,10 +11208,14 @@ // single-step fp_round we want to fold to. // In other words, double rounding isn't the same as rounding. // Also, this is a value preserving truncation iff both fp_round's are. 
- if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) { + if (DAG.getTarget().Options.UnsafeFPMath || N->isFast() || N0IsTrunc) { + const SDNodeFlags Flags = N->getFlags(); SDLoc DL(N); - return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0), + SDValue Val = + DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0), DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL)); + Val->setFlags(Flags); + return Val; } } Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3317,7 +3317,7 @@ break; case ISD::FP_TO_FP16: DEBUG(dbgs() << "Legalizing FP_TO_FP16\n"); - if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) { + if (!TLI.useSoftFloat() && (TM.Options.UnsafeFPMath || Node->isFast())) { SDValue Op = Node->getOperand(0); MVT SVT = Op.getSimpleValueType(); if ((SVT == MVT::f64 || SVT == MVT::f80) && Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4042,7 +4042,7 @@ break; case ISD::FNEG: // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 - if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB) + if ((getTarget().Options.UnsafeFPMath || Operand.getNode()->isFast()) && OpOpcode == ISD::FSUB) // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags? return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1), Operand.getOperand(0), Operand.getNode()->getFlags()); @@ -4435,7 +4435,7 @@ case ISD::FMUL: case ISD::FDIV: case ISD::FREM: - if (getTarget().Options.UnsafeFPMath) { + if (getTarget().Options.UnsafeFPMath || Flags.isFast()) { if (Opcode == ISD::FADD) { // x+0 --> x if (N2CFP && N2CFP->getValueAPF().isZero()) @@ -4811,7 +4811,7 @@ case ISD::FMUL: case ISD::FDIV: case ISD::FREM: - if (getTarget().Options.UnsafeFPMath) + if (getTarget().Options.UnsafeFPMath || Flags.isFast()) return N2; break; case ISD::MUL: Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4324,6 +4324,18 @@ } else Result = lowerRangeToAssertZExt(DAG, I, Result); + if (I.getType()->isFloatTy()) { + FastMathFlags FMF = I.getFastMathFlags(); + SDNodeFlags Flags; + Flags.setAllowReciprocal(FMF.allowReciprocal()); + Flags.setAllowContract(FMF.allowContract()); + Flags.setNoInfs(FMF.noInfs()); + Flags.setNoNaNs(FMF.noNaNs()); + Flags.setNoSignedZeros(FMF.noSignedZeros()); + Flags.setApproximateFuncs(FMF.approxFunc()); + Flags.setAllowReassociation(FMF.allowReassoc()); + Result->setFlags(Flags); + } setValue(&I, Result); } } @@ -5512,9 +5524,19 @@ case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break; } + FastMathFlags FMF = I.getFastMathFlags(); + SDNodeFlags Flags; + Flags.setAllowReciprocal(FMF.allowReciprocal()); + Flags.setAllowContract(FMF.allowContract()); + Flags.setNoInfs(FMF.noInfs()); + Flags.setNoNaNs(FMF.noNaNs()); + Flags.setNoSignedZeros(FMF.noSignedZeros()); + Flags.setApproximateFuncs(FMF.approxFunc()); + Flags.setAllowReassociation(FMF.allowReassoc()); setValue(&I, DAG.getNode(Opcode, sdl, getValue(I.getArgOperand(0)).getValueType(), - getValue(I.getArgOperand(0)))); + getValue(I.getArgOperand(0)), Flags)); + return nullptr; } case Intrinsic::minnum: { @@ -5545,13 +5567,25 @@ 
getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); return nullptr; - case Intrinsic::fma: - setValue(&I, DAG.getNode(ISD::FMA, sdl, - getValue(I.getArgOperand(0)).getValueType(), - getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)), - getValue(I.getArgOperand(2)))); + case Intrinsic::fma: { + FastMathFlags FMF = I.getFastMathFlags(); + SDNodeFlags Flags; + Flags.setAllowReciprocal(FMF.allowReciprocal()); + Flags.setAllowContract(FMF.allowContract()); + Flags.setNoInfs(FMF.noInfs()); + Flags.setNoNaNs(FMF.noNaNs()); + Flags.setNoSignedZeros(FMF.noSignedZeros()); + Flags.setApproximateFuncs(FMF.approxFunc()); + Flags.setAllowReassociation(FMF.allowReassoc()); + SDValue Val = DAG.getNode(ISD::FMA, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2))); + Val->setFlags(Flags); + setValue(&I, Val); return nullptr; + } case Intrinsic::experimental_constrained_fadd: case Intrinsic::experimental_constrained_fsub: case Intrinsic::experimental_constrained_fmul: Index: test/CodeGen/AArch64/fma-aggressive.ll =================================================================== --- test/CodeGen/AArch64/fma-aggressive.ll +++ test/CodeGen/AArch64/fma-aggressive.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s | FileCheck %s + +define float @test1(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test1 +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %u, %v + %mul.2 = fmul fast float %x, %y + %fma = fadd fast float %mul.2, %mul.1 + %res = fadd fast float %fma, %z + ret float %res +} + +define float @test2(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test2 +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %y, %z + %mul.2 = fmul fast float %u, %v + %fma = fadd fast float %mul.2, %mul.1 + %res = fadd fast float %x, %fma + ret float %res +} + +define float @test3(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test3 +; CHECK-EVEN: fnmsub {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %x, %y + %res = fsub fast float %mul.1, %z + ret float %res +} + +define float @test4(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test4 +; CHECK-EVEN: fnmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %x, %y + %neg = fsub fast float -0.0, %mul.1 + %res = fsub fast float %neg, %z + ret float %res +} Index: test/CodeGen/PowerPC/fmf-propagation.ll =================================================================== --- test/CodeGen/PowerPC/fmf-propagation.ll +++ test/CodeGen/PowerPC/fmf-propagation.ll @@ -39,7 +39,7 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. 
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_contract2:' -; FMFDEBUG: fma {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma contract {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_contract2:' define float @fmul_fadd_contract2(float %x, float %y, float %z) { @@ -86,7 +86,7 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_reassoc2:' -; FMFDEBUG: fma {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma reassoc {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_reassoc2:' define float @fmul_fadd_reassoc2(float %x, float %y, float %z) { @@ -109,7 +109,7 @@ ; The fadd is now fully 'fast'. This implies that contraction is allowed. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_fast1:' -; FMFDEBUG: fma {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_fast1:' define float @fmul_fadd_fast1(float %x, float %y, float %z) { @@ -132,7 +132,7 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_fast2:' -; FMFDEBUG: fma {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_fast2:' define float @fmul_fadd_fast2(float %x, float %y, float %z) { @@ -156,7 +156,7 @@ ; This is the minimum FMF needed for this transform - the FMA allows reassociation. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:' -; FMFDEBUG: fma {{t[0-9]+}} +; FMFDEBUG: fma reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:' @@ -192,7 +192,7 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:' -; FMFDEBUG: fma {{t[0-9]+}} +; FMFDEBUG: fma reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:' @@ -228,11 +228,11 @@ ; The FMA is now fully 'fast'. This implies that reassociation is allowed. 
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:' -; FMFDEBUG: fma {{t[0-9]+}} +; FMFDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:' -; GLOBALDEBUG: fmul reassoc {{t[0-9]+}} +; GLOBALDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:' define float @fmul_fma_fast1(float %x) { @@ -241,12 +241,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI8_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI8_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI8_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI8_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_fast1: @@ -264,11 +259,11 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:' -; FMFDEBUG: fma {{t[0-9]+}} +; FMFDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:' -; GLOBALDEBUG: fmul reassoc {{t[0-9]+}} +; GLOBALDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:' define float @fmul_fma_fast2(float %x) { @@ -277,12 +272,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI9_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI9_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI9_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI9_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_fast2: @@ -300,11 +290,11 @@ ; Reduced precision for sqrt is allowed - should use estimate and NR iterations. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:' -; FMFDEBUG: fsqrt {{t[0-9]+}} +; FMFDEBUG: fsqrt afn {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:' -; GLOBALDEBUG: fmul reassoc {{t[0-9]+}} +; GLOBALDEBUG: fmul afn {{t[0-9]+}} ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn:' define float @sqrt_afn(float %x) { @@ -340,17 +330,18 @@ ; The call is now fully 'fast'. This implies that approximation is allowed. 
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:' -; FMFDEBUG: fsqrt {{t[0-9]+}} +; FMFDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_fast:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:' -; GLOBALDEBUG: fmul reassoc {{t[0-9]+}} +; GLOBALDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_fast:' define float @sqrt_fast(float %x) { ; FMF-LABEL: sqrt_fast: -; FMF: # %bb.0: -; FMF-NEXT: xssqrtsp 1, 1 +; FMF: # %bb.1: +; FMF-NEXT: xsrsqrtesp 2, 1 +; FMF: fmr 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: sqrt_fast: Index: test/CodeGen/X86/fmf-flags.ll =================================================================== --- test/CodeGen/X86/fmf-flags.ll +++ test/CodeGen/X86/fmf-flags.ll @@ -7,9 +7,12 @@ define float @fast_recip_sqrt(float %x) { ; X64-LABEL: fast_recip_sqrt: ; X64: # %bb.0: -; X64-NEXT: sqrtss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: divss %xmm1, %xmm0 +; X64-NEXT: rsqrtss %xmm0, %xmm1 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: addss {{.*}}(%rip), %xmm0 +; X64-NEXT: mulss {{.*}}(%rip), %xmm1 +; X64-NEXT: mulss %xmm1, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fast_recip_sqrt: @@ -53,9 +56,9 @@ define double @not_so_fast_mul_add(double %x) { ; X64-LABEL: not_so_fast_mul_add: ; X64: # %bb.0: -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: movsd {{.*}}(%rip), %xmm1 ; X64-NEXT: mulsd %xmm0, %xmm1 -; X64-NEXT: addsd %xmm1, %xmm0 +; X64-NEXT: mulsd {{.*}}(%rip), %xmm0 ; X64-NEXT: movsd %xmm1, {{.*}}(%rip) ; X64-NEXT: retq ; @@ -64,7 +67,9 @@ ; X86-NEXT: fldl {{[0-9]+}}(%esp) ; X86-NEXT: fld %st(0) ; X86-NEXT: fmull {{\.LCPI.*}} -; X86-NEXT: fadd %st(0), %st(1) +; X86-NEXT: fxch %st(1) +; X86-NEXT: fmull {{\.LCPI.*}} +; X86-NEXT: fxch %st(1) ; X86-NEXT: fstpl mul1 ; X86-NEXT: retl %m = fmul double %x, 4.2 @@ -80,10 +85,14 @@ define float @not_so_fast_recip_sqrt(float %x) { ; X64-LABEL: not_so_fast_recip_sqrt: ; X64: # %bb.0: -; X64-NEXT: sqrtss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: divss %xmm1, %xmm0 -; X64-NEXT: movss %xmm1, {{.*}}(%rip) +; X64-NEXT: rsqrtss %xmm0, %xmm1 +; X64-NEXT: sqrtss %xmm0, %xmm2 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: addss {{.*}}(%rip), %xmm0 +; X64-NEXT: mulss {{.*}}(%rip), %xmm1 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: movss %xmm2, {{.*}}(%rip) ; X64-NEXT: retq ; ; X86-LABEL: not_so_fast_recip_sqrt: Index: test/CodeGen/X86/fmf-flags_fma.ll =================================================================== --- test/CodeGen/X86/fmf-flags_fma.ll +++ test/CodeGen/X86/fmf-flags_fma.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mattr=+avx2,+fma -mtriple=x86_64-apple-macosx10.8.0 | FileCheck %s -check-prefix=X64 + +declare float @llvm.fma.f32(float %a, float %b, float %c); + +define float @fast_fmuladd_rep1(float %a , float %b , float %c) { +; X64-LABEL: fast_fmuladd_rep1: +; X64: # %bb.0: +; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 + %mul.1 = fmul fast float %a, %b + %res = fadd fast float %mul.1, %c + ret float %res +} + +define float @fast_fmuladd_rep2(float %a , float %b , float %c) { +; X64-LABEL: fast_fmuladd_rep2: +; X64: # %bb.0: +; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 + %mul.1 = fmul fast float %a, %b + %res = fadd fast float %c, %mul.1 + 
ret float %res +} + +define float @fast_fmuladd_rep3(half %a , half %b , float %c) { +; X64-LABEL: fast_fmuladd_rep3: +; X64: # %bb.0: +; X64: vfmadd213ss {{[0-9]+}}(%rsp), %xmm1, %xmm0 + %mul.1 = fmul fast half %a, %b + %ext = fpext half %mul.1 to float + %res = fadd fast float %ext, %c + ret float %res +} + +define float @fast_fmuladd_rep4(half %a , half %b , float %c) { +; X64-LABEL: fast_fmuladd_rep4: +; X64: # %bb.0: +; X64: vfmadd213ss {{[0-9]+}}(%rsp), %xmm1, %xmm0 + %mul.1 = fmul fast half %a, %b + %ext = fpext half %mul.1 to float + %res = fadd fast float %c, %ext + ret float %res +} Index: test/CodeGen/X86/sqrt-fastmath-mir.ll =================================================================== --- test/CodeGen/X86/sqrt-fastmath-mir.ll +++ test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -7,16 +7,16 @@ ; CHECK: body: ; CHECK: %0:fr32 = COPY $xmm0 ; CHECK: %1:fr32 = VRSQRTSSr killed %2, %0 -; CHECK: %3:fr32 = reassoc VMULSSrr %0, %1 +; CHECK: %3:fr32 = VMULSSrr %0, %1 ; CHECK: %4:fr32 = VMOVSSrm ; CHECK: %5:fr32 = VFMADD213SSr %1, killed %3, %4 ; CHECK: %6:fr32 = VMOVSSrm -; CHECK: %7:fr32 = reassoc VMULSSrr %1, %6 -; CHECK: %8:fr32 = reassoc VMULSSrr killed %7, killed %5 -; CHECK: %9:fr32 = reassoc VMULSSrr %0, %8 +; CHECK: %7:fr32 = VMULSSrr %1, %6 +; CHECK: %8:fr32 = VMULSSrr killed %7, killed %5 +; CHECK: %9:fr32 = VMULSSrr %0, %8 ; CHECK: %10:fr32 = VFMADD213SSr %8, %9, %4 -; CHECK: %11:fr32 = reassoc VMULSSrr %9, %6 -; CHECK: %12:fr32 = reassoc VMULSSrr killed %11, killed %10 +; CHECK: %11:fr32 = VMULSSrr %9, %6 +; CHECK: %12:fr32 = VMULSSrr killed %11, killed %10 ; CHECK: %14:fr32 = FsFLD0SS ; CHECK: %15:fr32 = VCMPSSrr %0, killed %14, 0 ; CHECK: %17:vr128 = VANDNPSrr killed %16, killed %13 @@ -33,12 +33,12 @@ ; CHECK: %1:fr32 = VRSQRTSSr killed %2, %0 ; CHECK: %3:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr %0, %1 ; CHECK: %4:fr32 = VMOVSSrm -; CHECK: %5:fr32 = VFMADD213SSr %1, killed %3, %4 +; CHECK: %5:fr32 = nnan ninf nsz arcp contract afn reassoc VFMADD213SSr %1, killed %3, %4 ; CHECK: %6:fr32 = VMOVSSrm ; CHECK: %7:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr %1, %6 ; CHECK: %8:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr killed %7, killed %5 ; CHECK: %9:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr %0, %8 -; CHECK: %10:fr32 = VFMADD213SSr %8, killed %9, %4 +; CHECK: %10:fr32 = nnan ninf nsz arcp contract afn reassoc VFMADD213SSr %8, killed %9, %4 ; CHECK: %11:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr %8, %6 ; CHECK: %12:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr killed %11, killed %10 ; CHECK: $xmm0 = COPY %12