diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11957,6 +11957,7 @@
   // Always prefer FMAD to FMA for precision.
   unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+  bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();
 
   // Is the node an FMUL and contractable either due to global flags or
   // SDNodeFlags.
@@ -12120,7 +12121,7 @@
     //   -> (fma (fneg y), z, (fma (fneg u), v, x))
     if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&
         isContractableFMUL(N1.getOperand(2)) &&
-        N1->hasOneUse()) {
+        N1->hasOneUse() && NoSignedZero) {
       SDValue N20 = N1.getOperand(2).getOperand(0);
       SDValue N21 = N1.getOperand(2).getOperand(1);
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -2,6 +2,8 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
 
+; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
+
 ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
 ; beneficial even without fp32 denormals, but they do require no-infs-fp-math
 ; for correctness.
@@ -376,9 +378,10 @@
   %u = load volatile double, double addrspace(1)* %gep.3
   %v = load volatile double, double addrspace(1)* %gep.4
 
-  %tmp0 = fmul double %u, %v
-  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
-  %tmp2 = fsub double %x, %tmp1
+  ; nsz flag is needed since this combine may change sign of zero
+  %tmp0 = fmul nsz double %u, %v
+  %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
+  %tmp2 = fsub nsz double %x, %tmp1
 
   store double %tmp2, double addrspace(1)* %gep.out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
--- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s
 
+; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
+
 ; Make sure we don't form mad with denormals
 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
@@ -566,9 +568,10 @@
   %u = load volatile float, float addrspace(1)* %gep.3
   %v = load volatile float, float addrspace(1)* %gep.4
 
-  %tmp0 = fmul float %u, %v
-  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
-  %tmp2 = fsub float %x, %tmp1
+  ; nsz flag is needed since this combine may change sign of zero
+  %tmp0 = fmul nsz float %u, %v
+  %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
+  %tmp2 = fsub nsz float %x, %tmp1
 
   store float %tmp2, float addrspace(1)* %gep.out
   ret void
diff --git a/llvm/test/CodeGen/PowerPC/fma-assoc.ll b/llvm/test/CodeGen/PowerPC/fma-assoc.ll
--- a/llvm/test/CodeGen/PowerPC/fma-assoc.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-assoc.ll
@@ -331,15 +331,16 @@
 define double @test_reassoc_FMSUB_ASSOC2(double %A, double %B, double %C,
 ; CHECK-LABEL: test_reassoc_FMSUB_ASSOC2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fnmsub 0, 3, 4, 5
-; CHECK-NEXT:    fnmsub 1, 1, 2, 0
+; CHECK-NEXT:    fmul 0, 3, 4
+; CHECK-NEXT:    fmadd 0, 1, 2, 0
+; CHECK-NEXT:    fsub 1, 5, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC2:
 ; CHECK-VSX:       # %bb.0:
-; CHECK-VSX-NEXT:    xsnmsubmdp 3, 4, 5
-; CHECK-VSX-NEXT:    xsnmsubadp 3, 1, 2
-; CHECK-VSX-NEXT:    fmr 1, 3
+; CHECK-VSX-NEXT:    xsmuldp 0, 3, 4
+; CHECK-VSX-NEXT:    xsmaddadp 0, 1, 2
+; CHECK-VSX-NEXT:    xssubdp 1, 5, 0
 ; CHECK-VSX-NEXT:    blr
                                   double %D, double %E) {
   %F = fmul reassoc double %A, %B         ; <double> [#uses=1]
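
Why the nsz gate is needed: the rewrite
  (fsub x, (fma y, z, (fmul u, v))) -> (fma (fneg y), z, (fma (fneg u), v, x))
regroups the additions, and regrouping changes which zeros meet, so a result
of -0.0 can become +0.0. A minimal standalone C++ sketch of the discrepancy
(the input values below are illustrative, not taken from the test suite):

  #include <cmath>
  #include <cstdio>

  int main() {
    // Pick x = -0.0, y*z = -0.0, u*v = +0.0 so the final result is a zero
    // whose sign depends on how the expression is associated.
    double x = -0.0, y = 1.0, z = -0.0, u = 0.0, v = 1.0;
    // Before the combine: x - fma(y, z, u*v) = -0.0 - (-0.0 + 0.0) = -0.0.
    double before = x - std::fma(y, z, u * v);
    // After the combine: fma(-y, z, fma(-u, v, x)) = +0.0 + (-0.0) = +0.0.
    double after = std::fma(-y, z, std::fma(-u, v, x));
    std::printf("before = %g, after = %g\n", before, after); // -0 vs 0
    return 0;
  }

With no-signed-zeros in effect, either globally (NoSignedZerosFPMath) or per
instruction (the nsz flags the tests now carry), -0.0 and +0.0 are
interchangeable and the transformation remains sound.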