Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -889,7 +889,8 @@ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, SDValue N2, const SDNodeFlags Flags = SDNodeFlags()); SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDValue N3); + SDValue N2, SDValue N3, + const SDNodeFlags Flags = SDNodeFlags()); SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4); SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9578,8 +9578,10 @@ if (!HasFMAD && !HasFMA) return SDValue(); + SDNodeFlags Flags = N->getFlags(); + bool CanFuse = Options.UnsafeFPMath || isContractable(N); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD); + CanFuse || HasFMAD); // If the addition is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); @@ -9609,14 +9611,14 @@ // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), N1); + N0.getOperand(0), N0.getOperand(1), N1, Flags); } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, - N1.getOperand(0), N1.getOperand(1), N0); + N1.getOperand(0), N1.getOperand(1), N0, Flags); } // Look through FP_EXTEND nodes to do more combining. @@ -9630,7 +9632,7 @@ DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), N1); + N00.getOperand(1)), N1, Flags); } } @@ -9644,16 +9646,14 @@ DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(1)), N0); + N10.getOperand(1)), N0, Flags); } } // More folding opportunities when target permits. if (Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && + if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && N0.getOperand(2).getOpcode() == ISD::FMUL && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { @@ -9662,13 +9662,11 @@ DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), - N1)); + N1, Flags), Flags); } // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && + if (CanFuse && N1->getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FMUL && N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) { @@ -9677,19 +9675,19 @@ DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(2).getOperand(0), N1.getOperand(2).getOperand(1), - N0)); + N0, Flags), Flags); } - // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&] ( - SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { + SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, + SDNodeFlags Flags) { return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, U), DAG.getNode(ISD::FP_EXTEND, SL, VT, V), - Z)); + Z, Flags), Flags); }; if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); @@ -9699,7 +9697,7 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), N020.getOperand(0), N020.getOperand(1), - N1); + N1, Flags); } } } @@ -9710,14 +9708,15 @@ // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. auto FoldFAddFPExtFMAFMul = [&] ( - SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { + SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, + SDNodeFlags Flags) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, X), DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, U), DAG.getNode(ISD::FP_EXTEND, SL, VT, V), - Z)); + Z, Flags), Flags); }; if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); @@ -9727,7 +9726,7 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), N002.getOperand(0), N002.getOperand(1), - N1); + N1, Flags); } } } @@ -9742,7 +9741,7 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), N120.getOperand(0), N120.getOperand(1), - N0); + N0, Flags); } } } @@ -9760,7 +9759,7 @@ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), N102.getOperand(0), N102.getOperand(1), - N0); + N0, Flags); } } } @@ -9789,8 +9788,11 @@ if (!HasFMAD && !HasFMA) return SDValue(); + const SDNodeFlags Flags = N->getFlags(); + bool CanFuse = Options.UnsafeFPMath || isContractable(N); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD); + CanFuse || HasFMAD); + // If the subtraction is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); @@ -9815,16 +9817,17 @@ if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), - DAG.getNode(ISD::FNEG, SL, VT, N1)); + DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); } // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. - if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) + if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), - N1.getOperand(1), N0); + N1.getOperand(1), N0, Flags); + } // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && @@ -9833,7 +9836,7 @@ SDValue N01 = N0.getOperand(0).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N00), N01, - DAG.getNode(ISD::FNEG, SL, VT, N1)); + DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); } // Look through FP_EXTEND nodes to do more combining. @@ -9849,7 +9852,7 @@ N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, N1)); + DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); } } @@ -9866,7 +9869,7 @@ N10.getOperand(0))), DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), - N0); + N0, Flags); } } @@ -9888,7 +9891,7 @@ N000.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), - N1)); + N1, Flags)); } } } @@ -9911,7 +9914,7 @@ N000.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), - N1)); + N1, Flags)); } } } @@ -9920,9 +9923,7 @@ if (Aggressive) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode && + if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -9931,14 +9932,12 @@ N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, - N1))); + N1), Flags), Flags); } // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode && + if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N1.getOperand(2))) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); @@ -9948,11 +9947,9 @@ N1.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N20), - - N21, N0)); + N21, N0, Flags), Flags); } - // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode) { @@ -9969,7 +9966,7 @@ DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)), DAG.getNode(ISD::FNEG, SL, VT, - N1))); + N1), Flags), Flags); } } } @@ -9997,7 +9994,7 @@ DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)), DAG.getNode(ISD::FNEG, SL, VT, - N1))); + N1), Flags), Flags); } } } @@ -10020,7 +10017,7 @@ VT, N1200)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N1201), - N0)); + N0, Flags), Flags); } } @@ -10051,7 +10048,7 @@ VT, N1020)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N1021), - N0)); + N0, Flags), Flags); } } } @@ -10067,6 +10064,7 @@ SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); + const SDNodeFlags Flags = N->getFlags(); assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation"); @@ -10098,52 +10096,54 @@ // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y) // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y)) - auto FuseFADD = [&](SDValue X, SDValue Y) { + auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) { auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); if (XC1 && XC1->isExactlyValue(+1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + Y, Flags); if (XC1 && XC1->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y)); + DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); } return SDValue(); }; - if (SDValue FMA = FuseFADD(N0, N1)) + if (SDValue FMA = FuseFADD(N0, N1, Flags)) return FMA; - if (SDValue FMA = FuseFADD(N1, N0)) + if (SDValue FMA = FuseFADD(N1, N0, Flags)) return FMA; // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y) // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y)) // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y)) // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y) - auto FuseFSUB = [&](SDValue X, SDValue Y) { + auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) { auto XC0 = isConstOrConstSplatFP(X.getOperand(0)); if (XC0 && XC0->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, - Y); + Y, Flags); if (XC0 && XC0->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y)); + DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); if (XC1 && XC1->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y)); + DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); if (XC1 && XC1->isExactlyValue(-1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + Y, Flags); } return SDValue(); }; - if (SDValue FMA = FuseFSUB(N0, N1)) + if (SDValue FMA = FuseFSUB(N0, N1, Flags)) return FMA; - if (SDValue FMA = FuseFSUB(N1, N0)) + if (SDValue FMA = FuseFSUB(N1, N0, Flags)) return FMA; return SDValue(); @@ -10554,6 +10554,10 @@ SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; + // FMA nodes have flags that propagate to the created nodes. + const SDNodeFlags Flags = N->getFlags(); + bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N); + // Constant fold FMA. if (isa(N0) && isa(N1) && @@ -10561,7 +10565,7 @@ return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); } - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; if (N1CFP && N1CFP->isZero()) @@ -10578,12 +10582,7 @@ !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); - // TODO: FMA nodes should have flags that propagate to the created nodes. - // For now, create a Flags object for use with reassociation math transforms. - SDNodeFlags Flags; - Flags.setAllowReassociation(true); - - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && isConstantFPBuildVectorOrConstantFP(N1) && @@ -10629,7 +10628,7 @@ } } - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { // (fma x, c, x) -> (fmul x, (c+1)) if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, DL, VT, N0, Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4859,7 +4859,8 @@ } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - SDValue N1, SDValue N2, SDValue N3) { + SDValue N1, SDValue N2, SDValue N3, + const SDNodeFlags Flags) { // Perform various simplifications. switch (Opcode) { case ISD::FMA: { @@ -4953,10 +4954,13 @@ FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; - if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) + if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { + E->intersectFlagsWith(Flags); return SDValue(E, 0); + } N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); + N->setFlags(Flags); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { Index: test/CodeGen/AArch64/fma-aggressive.ll =================================================================== --- test/CodeGen/AArch64/fma-aggressive.ll +++ test/CodeGen/AArch64/fma-aggressive.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s | FileCheck %s + +define float @test1(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test1 +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %u, %v + %mul.2 = fmul fast float %x, %y + %fma = fadd fast float %mul.2, %mul.1 + %res = fadd fast float %fma, %z + ret float %res +} + +define float @test2(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test2 +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %y, %z + %mul.2 = fmul fast float %u, %v + %fma = fadd fast float %mul.2, %mul.1 + %res = fadd fast float %x, %fma + ret float %res +} + +define float @test3(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test3 +; CHECK-EVEN: fnmsub {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %x, %y + %res = fsub fast float %mul.1, %z + ret float %res +} + +define float @test4(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test4 +; CHECK-EVEN: fnmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %x, %y + %neg = fsub fast float -0.0, %mul.1 + %res = fsub fast float %neg, %z + ret float %res +} Index: test/CodeGen/AArch64/neon-fma-FMF.ll =================================================================== --- test/CodeGen/AArch64/neon-fma-FMF.ll +++ test/CodeGen/AArch64/neon-fma-FMF.ll @@ -1,13 +1,23 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -define <2 x float> @fma(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; CHECK-LABEL: fma: +define <2 x float> @fma_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma_1: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp1 = fmul contract <2 x float> %A, %B; %tmp2 = fadd contract <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } +; This case will fold as it was only available through unsafe before, now available from +; the contract on the fadd +define <2 x float> @fma_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma_2: +; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp1 = fmul <2 x float> %A, %B; + %tmp2 = fadd contract <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + define <2 x float> @no_fma_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) { ; CHECK-LABEL: no_fma_1: ; CHECK: fmul @@ -17,19 +27,20 @@ ret <2 x float> %tmp2 } -define <2 x float> @no_fma_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; CHECK-LABEL: no_fma_2: -; CHECK: fmul -; CHECK: fadd - %tmp1 = fmul <2 x float> %A, %B; - %tmp2 = fadd contract <2 x float> %C, %tmp1; +define <2 x float> @fma_sub_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma_sub_1: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp1 = fmul contract <2 x float> %A, %B; + %tmp2 = fsub contract <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } -define <2 x float> @fma_sub(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; CHECK-LABEL: fma_sub: +; This case will fold as it was only available through unsafe before, now available from +; the contract on the fsub +define <2 x float> @fma_sub_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma_sub_2: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %tmp1 = fmul contract <2 x float> %A, %B; + %tmp1 = fmul <2 x float> %A, %B; %tmp2 = fsub contract <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } @@ -42,12 +53,3 @@ %tmp2 = fsub <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } - -define <2 x float> @no_fma_sub_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; CHECK-LABEL: no_fma_sub_2: -; CHECK: fmul -; CHECK: fsub - %tmp1 = fmul <2 x float> %A, %B; - %tmp2 = fsub contract <2 x float> %C, %tmp1; - ret <2 x float> %tmp2 -} Index: test/CodeGen/PowerPC/fma-aggr-FMF.ll =================================================================== --- test/CodeGen/PowerPC/fma-aggr-FMF.ll +++ test/CodeGen/PowerPC/fma-aggr-FMF.ll @@ -22,10 +22,10 @@ define float @no_fma_with_fewer_uses(float %f1, float %f2, float %f3, float %f4) { ; CHECK-LABEL: no_fma_with_fewer_uses: ; CHECK: # %bb.0: -; CHECK-NEXT: xsmulsp 0, 3, 4 -; CHECK-NEXT: xsmulsp 13, 1, 2 -; CHECK-NEXT: xsmaddasp 0, 1, 2 -; CHECK-NEXT: xsdivsp 1, 13, 0 +; CHECK-NEXT: xsmulsp 0, 1, 2 +; CHECK-NEXT: fmr 1, 0 +; CHECK-NEXT: xsmaddasp 1, 3, 4 +; CHECK-NEXT: xsdivsp 1, 0, 1 ; CHECK-NEXT: blr %mul1 = fmul contract float %f1, %f2 %mul2 = fmul float %f3, %f4 Index: test/CodeGen/PowerPC/fmf-propagation.ll =================================================================== --- test/CodeGen/PowerPC/fmf-propagation.ll +++ test/CodeGen/PowerPC/fmf-propagation.ll @@ -15,15 +15,14 @@ ; X * Y + Z --> fma(X, Y, Z) ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_contract1:' -; FMFDEBUG: fmul {{t[0-9]+}}, {{t[0-9]+}} -; FMFDEBUG: fadd contract {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma contract {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_contract1:' define float @fmul_fadd_contract1(float %x, float %y, float %z) { ; FMF-LABEL: fmul_fadd_contract1: ; FMF: # %bb.0: -; FMF-NEXT: xsmulsp 0, 1, 2 -; FMF-NEXT: xsaddsp 1, 0, 3 +; FMF-NEXT: xsmaddasp 3, 1, 2 +; FMF-NEXT: fmr 1, 3 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fadd_contract1: @@ -39,7 +38,7 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_contract2:' -; FMFDEBUG: fma {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma contract {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_contract2:' define float @fmul_fadd_contract2(float %x, float %y, float %z) { @@ -62,15 +61,14 @@ ; Reassociation implies that FMA contraction is allowed. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_reassoc1:' -; FMFDEBUG: fmul {{t[0-9]+}}, {{t[0-9]+}} -; FMFDEBUG: fadd reassoc {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma reassoc {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_reassoc1:' define float @fmul_fadd_reassoc1(float %x, float %y, float %z) { ; FMF-LABEL: fmul_fadd_reassoc1: ; FMF: # %bb.0: -; FMF-NEXT: xsmulsp 0, 1, 2 -; FMF-NEXT: xsaddsp 1, 0, 3 +; FMF-NEXT: xsmaddasp 3, 1, 2 +; FMF-NEXT: fmr 1, 3 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fadd_reassoc1: @@ -86,7 +84,7 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_reassoc2:' -; FMFDEBUG: fma {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma reassoc {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_reassoc2:' define float @fmul_fadd_reassoc2(float %x, float %y, float %z) { @@ -109,7 +107,7 @@ ; The fadd is now fully 'fast'. This implies that contraction is allowed. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_fast1:' -; FMFDEBUG: fma {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_fast1:' define float @fmul_fadd_fast1(float %x, float %y, float %z) { @@ -132,7 +130,7 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_fast2:' -; FMFDEBUG: fma {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_fast2:' define float @fmul_fadd_fast2(float %x, float %y, float %z) { @@ -156,7 +154,7 @@ ; This is the minimum FMF needed for this transform - the FMA allows reassociation. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:' -; FMFDEBUG: fma reassoc {{t[0-9]+}} +; FMFDEBUG: fmul reassoc {{t[0-9]+}}, ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:' @@ -169,12 +167,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI6_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI6_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI6_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI6_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_reassoc1: @@ -192,7 +185,7 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:' -; FMFDEBUG: fma reassoc {{t[0-9]+}} +; FMFDEBUG: fmul reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:' @@ -205,12 +198,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI7_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI7_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI7_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI7_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_reassoc2: @@ -228,11 +216,11 @@ ; The FMA is now fully 'fast'. This implies that reassociation is allowed. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:' -; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} +; FMFDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:' -; GLOBALDEBUG: fmul reassoc {{t[0-9]+}} +; GLOBALDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:' define float @fmul_fma_fast1(float %x) { @@ -241,12 +229,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI8_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI8_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI8_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI8_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_fast1: @@ -264,11 +247,11 @@ ; This shouldn't change anything - the intermediate fmul result is now also flagged. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:' -; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} +; FMFDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:' -; GLOBALDEBUG: fmul reassoc {{t[0-9]+}} +; GLOBALDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:' define float @fmul_fma_fast2(float %x) { @@ -277,12 +260,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI9_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI9_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI9_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI9_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_fast2: Index: test/CodeGen/X86/fmf-flags_fma.ll =================================================================== --- test/CodeGen/X86/fmf-flags_fma.ll +++ test/CodeGen/X86/fmf-flags_fma.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mattr=+avx2,+fma -mtriple=x86_64-apple-macosx10.8.0 | FileCheck %s -check-prefix=X64 + +declare float @llvm.fma.f32(float %a, float %b, float %c); + +define float @fast_fmuladd_rep1(float %a , float %b , float %c) { +; X64-LABEL: fast_fmuladd_rep1: +; X64: # %bb.0: +; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 + %mul.1 = fmul fast float %a, %b + %res = fadd fast float %mul.1, %c + ret float %res +} + +define float @fast_fmuladd_rep2(float %a , float %b , float %c) { +; X64-LABEL: fast_fmuladd_rep2: +; X64: # %bb.0: +; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 + %mul.1 = fmul fast float %a, %b + %res = fadd fast float %c, %mul.1 + ret float %res +} + +define float @fast_fmuladd_rep3(half %a , half %b , float %c) { +; X64-LABEL: fast_fmuladd_rep3: +; X64: # %bb.0: +; X64: vfmadd213ss {{[0-9]+}}(%rsp), %xmm1, %xmm0 + %mul.1 = fmul fast half %a, %b + %ext = fpext half %mul.1 to float + %res = fadd fast float %ext, %c + ret float %res +} + +define float @fast_fmuladd_rep4(half %a , half %b , float %c) { +; X64-LABEL: fast_fmuladd_rep4: +; X64: # %bb.0: +; X64: vfmadd213ss {{[0-9]+}}(%rsp), %xmm1, %xmm0 + %mul.1 = fmul fast half %a, %b + %ext = fpext half %mul.1 to float + %res = fadd fast float %c, %ext + ret float %res +} Index: test/CodeGen/X86/sqrt-fastmath-mir.ll =================================================================== --- test/CodeGen/X86/sqrt-fastmath-mir.ll +++ test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -9,12 +9,12 @@ ; CHECK: %1:fr32 = VRSQRTSSr killed %2, %0 ; CHECK: %3:fr32 = reassoc VMULSSrr %0, %1 ; CHECK: %4:fr32 = VMOVSSrm -; CHECK: %5:fr32 = VFMADD213SSr %1, killed %3, %4 +; CHECK: %5:fr32 = reassoc VFMADD213SSr %1, killed %3, %4 ; CHECK: %6:fr32 = VMOVSSrm ; CHECK: %7:fr32 = reassoc VMULSSrr %1, %6 ; CHECK: %8:fr32 = reassoc VMULSSrr killed %7, killed %5 ; CHECK: %9:fr32 = reassoc VMULSSrr %0, %8 -; CHECK: %10:fr32 = VFMADD213SSr %8, %9, %4 +; CHECK: %10:fr32 = reassoc VFMADD213SSr %8, %9, %4 ; CHECK: %11:fr32 = reassoc VMULSSrr %9, %6 ; CHECK: %12:fr32 = reassoc VMULSSrr killed %11, killed %10 ; CHECK: %14:fr32 = FsFLD0SS @@ -33,12 +33,12 @@ ; CHECK: %1:fr32 = VRSQRTSSr killed %2, %0 ; CHECK: %3:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr %0, %1 ; CHECK: %4:fr32 = VMOVSSrm -; CHECK: %5:fr32 = VFMADD213SSr %1, killed %3, %4 +; CHECK: %5:fr32 = nnan ninf nsz arcp contract afn reassoc VFMADD213SSr %1, killed %3, %4 ; CHECK: %6:fr32 = VMOVSSrm ; CHECK: %7:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr %1, %6 ; CHECK: %8:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr killed %7, killed %5 ; CHECK: %9:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr %0, %8 -; CHECK: %10:fr32 = VFMADD213SSr %8, killed %9, %4 +; CHECK: %10:fr32 = nnan ninf nsz arcp contract afn reassoc VFMADD213SSr %8, killed %9, %4 ; CHECK: %11:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr %8, %6 ; CHECK: %12:fr32 = nnan ninf nsz arcp contract afn reassoc VMULSSrr killed %11, killed %10 ; CHECK: $xmm0 = COPY %12