Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9627,8 +9627,9 @@ return SDValue(); SDNodeFlags Flags = N->getFlags(); + bool CanFuse = Options.UnsafeFPMath || isContractable(N); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD); + CanFuse || HasFMAD); // If the addition is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); @@ -9700,9 +9701,7 @@ // More folding opportunities when target permits. if (Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && + if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && N0.getOperand(2).getOpcode() == ISD::FMUL && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { @@ -9715,9 +9714,7 @@ } // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && + if (CanFuse && N1->getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FMUL && N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) { @@ -9841,8 +9838,9 @@ return SDValue(); const SDNodeFlags Flags = N->getFlags(); + bool CanFuse = Options.UnsafeFPMath || isContractable(N); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD); + CanFuse || HasFMAD); // If the subtraction is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) @@ -9873,11 +9871,12 @@ // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. 
- if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) + if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), N0, Flags); + } // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && @@ -9973,9 +9972,7 @@ if (Aggressive) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode && + if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -9989,9 +9986,7 @@ // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) - // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF - // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode && + if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N1.getOperand(2))) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); @@ -10611,6 +10606,7 @@ // FMA nodes have flags that propagate to the created nodes. const SDNodeFlags Flags = N->getFlags(); + bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N); // Constant fold FMA. 
if (isa(N0) && @@ -10619,7 +10615,7 @@ return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); } - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; if (N1CFP && N1CFP->isZero()) @@ -10636,7 +10632,7 @@ !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && isConstantFPBuildVectorOrConstantFP(N1) && @@ -10682,7 +10678,7 @@ } } - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { // (fma x, c, x) -> (fmul x, (c+1)) if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, DL, VT, N0, Index: test/CodeGen/AArch64/fma-aggressive.ll =================================================================== --- test/CodeGen/AArch64/fma-aggressive.ll +++ test/CodeGen/AArch64/fma-aggressive.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s | FileCheck %s + +define float @test1(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test1 +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %u, %v + %mul.2 = fmul fast float %x, %y + %fma = fadd fast float %mul.2, %mul.1 + %res = fadd fast float %fma, %z + ret float %res +} + +define float @test2(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test2 +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} +; CHECK-EVEN: fmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %y, %z + %mul.2 = fmul fast float %u, %v + %fma = fadd fast float %mul.2, %mul.1 + %res = fadd fast float %x, %fma + ret float %res +} + +define float @test3(float %u , 
float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test3 +; CHECK-EVEN: fnmsub {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %x, %y + %res = fsub fast float %mul.1, %z + ret float %res +} + +define float @test4(float %u , float %v , float %x, float %y, float %z) { +; CHECK-LABEL: test4 +; CHECK-EVEN: fnmadd {{s[0-9]*[02468]}}, {{s[0-9]*}}, {{s[0-9]*}}, {{s[0-9]*[02468]}} + %mul.1 = fmul fast float %x, %y + %neg = fsub fast float -0.0, %mul.1 + %res = fsub fast float %neg, %z + ret float %res +} Index: test/CodeGen/AArch64/neon-fma-FMF.ll =================================================================== --- test/CodeGen/AArch64/neon-fma-FMF.ll +++ test/CodeGen/AArch64/neon-fma-FMF.ll @@ -1,13 +1,23 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -define <2 x float> @fma(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; CHECK-LABEL: fma: +define <2 x float> @fma_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma_1: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp1 = fmul contract <2 x float> %A, %B; %tmp2 = fadd contract <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } +; This case now folds: fusion here used to require unsafe-fp-math, but the contract +; flag on the fadd alone is now sufficient. +define <2 x float> @fma_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma_2: +; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp1 = fmul <2 x float> %A, %B; + %tmp2 = fadd contract <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + define <2 x float> @no_fma_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) { ; CHECK-LABEL: no_fma_1: ; CHECK: fmul @@ -17,19 +27,20 @@ ret <2 x float> %tmp2 } -define <2 x float> @no_fma_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; CHECK-LABEL: no_fma_2: -; CHECK: fmul -; CHECK: fadd - %tmp1 = fmul <2 x float> %A, %B; - %tmp2 = fadd contract <2 x float> %C, 
%tmp1; +define <2 x float> @fma_sub_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma_sub_1: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp1 = fmul contract <2 x float> %A, %B; + %tmp2 = fsub contract <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } -define <2 x float> @fma_sub(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; CHECK-LABEL: fma_sub: +; This case now folds: fusion here used to require unsafe-fp-math, but the contract +; flag on the fsub alone is now sufficient. +define <2 x float> @fma_sub_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma_sub_2: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %tmp1 = fmul contract <2 x float> %A, %B; + %tmp1 = fmul <2 x float> %A, %B; %tmp2 = fsub contract <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } @@ -42,12 +53,3 @@ %tmp2 = fsub <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } - -define <2 x float> @no_fma_sub_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; CHECK-LABEL: no_fma_sub_2: -; CHECK: fmul -; CHECK: fsub - %tmp1 = fmul <2 x float> %A, %B; - %tmp2 = fsub contract <2 x float> %C, %tmp1; - ret <2 x float> %tmp2 -} Index: test/CodeGen/PowerPC/fma-aggr-FMF.ll =================================================================== --- test/CodeGen/PowerPC/fma-aggr-FMF.ll +++ test/CodeGen/PowerPC/fma-aggr-FMF.ll @@ -22,10 +22,10 @@ define float @no_fma_with_fewer_uses(float %f1, float %f2, float %f3, float %f4) { ; CHECK-LABEL: no_fma_with_fewer_uses: ; CHECK: # %bb.0: -; CHECK-NEXT: xsmulsp 0, 3, 4 -; CHECK-NEXT: xsmulsp 13, 1, 2 -; CHECK-NEXT: xsmaddasp 0, 1, 2 -; CHECK-NEXT: xsdivsp 1, 13, 0 +; CHECK-NEXT: xsmulsp 0, 1, 2 +; CHECK-NEXT: fmr 1, 0 +; CHECK-NEXT: xsmaddasp 1, 3, 4 +; CHECK-NEXT: xsdivsp 1, 0, 1 ; CHECK-NEXT: blr %mul1 = fmul contract float %f1, %f2 %mul2 = fmul float %f3, %f4 Index: test/CodeGen/PowerPC/fmf-propagation.ll =================================================================== --- 
test/CodeGen/PowerPC/fmf-propagation.ll +++ test/CodeGen/PowerPC/fmf-propagation.ll @@ -15,15 +15,14 @@ ; X * Y + Z --> fma(X, Y, Z) ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_contract1:' -; FMFDEBUG: fmul {{t[0-9]+}}, {{t[0-9]+}} -; FMFDEBUG: fadd contract {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma contract {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_contract1:' define float @fmul_fadd_contract1(float %x, float %y, float %z) { ; FMF-LABEL: fmul_fadd_contract1: ; FMF: # %bb.0: -; FMF-NEXT: xsmulsp 0, 1, 2 -; FMF-NEXT: xsaddsp 1, 0, 3 +; FMF-NEXT: xsmaddasp 3, 1, 2 +; FMF-NEXT: fmr 1, 3 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fadd_contract1: @@ -62,15 +61,14 @@ ; Reassociation implies that FMA contraction is allowed. ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fadd_reassoc1:' -; FMFDEBUG: fmul {{t[0-9]+}}, {{t[0-9]+}} -; FMFDEBUG: fadd reassoc {{t[0-9]+}}, {{t[0-9]+}} +; FMFDEBUG: fma reassoc {{t[0-9]+}}, {{t[0-9]+}}, {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fadd_reassoc1:' define float @fmul_fadd_reassoc1(float %x, float %y, float %z) { ; FMF-LABEL: fmul_fadd_reassoc1: ; FMF: # %bb.0: -; FMF-NEXT: xsmulsp 0, 1, 2 -; FMF-NEXT: xsaddsp 1, 0, 3 +; FMF-NEXT: xsmaddasp 3, 1, 2 +; FMF-NEXT: fmr 1, 3 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fadd_reassoc1: @@ -156,7 +154,7 @@ ; This is the minimum FMF needed for this transform - the FMA allows reassociation. 
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:' -; FMFDEBUG: fma reassoc {{t[0-9]+}} +; FMFDEBUG: fmul reassoc {{t[0-9]+}}, ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:' @@ -169,12 +167,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI6_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI6_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI6_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI6_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_reassoc1: @@ -193,7 +186,6 @@ ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:' ; FMFDEBUG: fmul reassoc {{t[0-9]+}} -; FMFDEBUG: fma reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:' @@ -206,12 +198,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI7_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI7_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI7_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI7_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_reassoc2: @@ -229,7 +216,7 @@ ; The FMA is now fully 'fast'. This implies that reassociation is allowed. 
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:' -; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} +; FMFDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:' @@ -242,12 +229,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI8_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI8_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI8_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI8_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_fast1: @@ -266,7 +248,6 @@ ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:' ; FMFDEBUG: fmul nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} -; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:' @@ -279,12 +260,7 @@ ; FMF-NEXT: addis 3, 2, .LCPI9_0@toc@ha ; FMF-NEXT: addi 3, 3, .LCPI9_0@toc@l ; FMF-NEXT: lfsx 0, 0, 3 -; FMF-NEXT: addis 3, 2, .LCPI9_1@toc@ha -; FMF-NEXT: addi 3, 3, .LCPI9_1@toc@l -; FMF-NEXT: lfsx 2, 0, 3 -; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: xsmaddasp 0, 1, 2 -; FMF-NEXT: fmr 1, 0 +; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: fmul_fma_fast2: Index: test/CodeGen/X86/fmf-flags_fma.ll =================================================================== --- test/CodeGen/X86/fmf-flags_fma.ll +++ test/CodeGen/X86/fmf-flags_fma.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mattr=+avx2,+fma -mtriple=x86_64-apple-macosx10.8.0 | FileCheck %s -check-prefix=X64 + +declare float @llvm.fma.f32(float %a, float %b, float %c); + +define float 
@fast_fmuladd_rep1(float %a , float %b , float %c) { +; X64-LABEL: fast_fmuladd_rep1: +; X64: # %bb.0: +; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 + %mul.1 = fmul fast float %a, %b + %res = fadd fast float %mul.1, %c + ret float %res +} + +define float @fast_fmuladd_rep2(float %a , float %b , float %c) { +; X64-LABEL: fast_fmuladd_rep2: +; X64: # %bb.0: +; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 + %mul.1 = fmul fast float %a, %b + %res = fadd fast float %c, %mul.1 + ret float %res +} + +define float @fast_fmuladd_rep3(half %a , half %b , float %c) { +; X64-LABEL: fast_fmuladd_rep3: +; X64: # %bb.0: +; X64: vfmadd213ss {{[0-9]+}}(%rsp), %xmm1, %xmm0 + %mul.1 = fmul fast half %a, %b + %ext = fpext half %mul.1 to float + %res = fadd fast float %ext, %c + ret float %res +} + +define float @fast_fmuladd_rep4(half %a , half %b , float %c) { +; X64-LABEL: fast_fmuladd_rep4: +; X64: # %bb.0: +; X64: vfmadd213ss {{[0-9]+}}(%rsp), %xmm1, %xmm0 + %mul.1 = fmul fast half %a, %b + %ext = fpext half %mul.1 to float + %res = fadd fast float %c, %ext + ret float %res +}