diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15233,6 +15233,13 @@
   if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
     return SDValue();
 
+  // Folding fadd (fmul x, y), (fmul x, y) -> fma x, y, (fmul x, y) is never
+  // beneficial. It does not reduce latency. It increases register pressure. It
+  // replaces an fadd with an fma which is a more complex instruction, so is
+  // likely to have a larger encoding, use more functional units, etc.
+  if (N0 == N1)
+    return SDValue();
+
   if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -277,9 +277,7 @@ define amdgpu_ps float @fma_vs_output_modifier_2(float %x) #0 {
 ; GCN-LABEL: fma_vs_output_modifier_2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    v_mul_f32_e32 v1, v0, v0
-; GCN-NEXT:    v_fmac_f32_e32 v1, v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mul_f32_e64 v0, v0, v0 mul:2
 ; GCN-NEXT:    ; return to shader part epilog
   %m = fmul contract float %x, %x
   %a = fadd nsz contract float %m, %m