Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2642,11 +2642,13 @@
     return false;
   }
 
-  /// Returns true if the FADD or FSUB node passed could legally be combined with
-  /// an fmul to form an ISD::FMAD.
-  virtual bool isFMADLegalForFAddFSub(const SelectionDAG &DAG,
-                                      const SDNode *N) const {
-    assert(N->getOpcode() == ISD::FADD || N->getOpcode() == ISD::FSUB);
+  /// Returns true if \p N can be combined to form an ISD::FMAD. \p N may be an
+  /// ISD::FADD, ISD::FSUB, or an ISD::FMUL which will be distributed into an
+  /// fadd/fsub.
+  virtual bool isFMADLegal(const SelectionDAG &DAG,
+                           const SDNode *N) const {
+    assert(N->getOpcode() == ISD::FADD || N->getOpcode() == ISD::FSUB ||
+           N->getOpcode() == ISD::FMUL);
     return isOperationLegal(ISD::FMAD, N->getValueType(0));
   }
 
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11728,7 +11728,7 @@
   const TargetOptions &Options = DAG.getTarget().Options;
 
   // Floating-point multiply-add with intermediate rounding.
-  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));
+  bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
@@ -11945,7 +11945,7 @@
 
   const TargetOptions &Options = DAG.getTarget().Options;
   // Floating-point multiply-add with intermediate rounding.
-  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));
+  bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
@@ -12289,7 +12289,7 @@
   // Floating-point multiply-add with intermediate rounding. This can result
   // in a less precise result due to the changed rounding order.
   bool HasFMAD = Options.UnsafeFPMath &&
-                 (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+                 (LegalOperations && TLI.isFMADLegal(DAG, N));
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -358,8 +358,8 @@
   MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                   EVT VT) const override;
-  bool isFMADLegalForFAddFSub(const SelectionDAG &DAG,
-                              const SDNode *N) const override;
+  bool isFMADLegal(const SelectionDAG &DAG,
+                   const SDNode *N) const override;
 
   SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3946,8 +3946,8 @@
   return false;
 }
 
-bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG,
-                                              const SDNode *N) const {
+bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
+                                   const SDNode *N) const {
   // TODO: Check future ftz flag
   // v_mad_f32/v_mac_f32 do not support denormals.
   EVT VT = N->getValueType(0);
Index: llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,FMA %s
+; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,NOFUSE %s
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,NOFUSE %s
+; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,NOFUSE %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,FMA %s
+
+; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+
+; Check for incorrect fmad formation when distributing an fmul into an fadd/fsub.
+
+define float @unsafe_fmul_fadd_distribute_fast_f32(float %arg0, float %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, v1, v0, v0
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mac_f32_e32 v0, v1, v0
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd fast float %arg1, 1.0
+  %tmp1 = fmul fast float %arg0, %add
+  ret float %tmp1
+}
+
+define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, -v1, v0, v0
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_sub_f32_e32 v1, 1.0, v1
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, -v1, v0, v0
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fsub fast float 1.0, %arg1
+  %tmp1 = fmul fast float %arg0, %add
+  ret float %tmp1
+}
+
+define <2 x float> @unsafe_fmul_fadd_distribute_fast_v2f32(<2 x float> %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, v2, v0, v0
+; FMA-NEXT:    v_fma_f32 v1, v3, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; NOFUSE-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mac_f32_e32 v0, v2, v0
+; FMAD-NEXT:    v_mac_f32_e32 v1, v3, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd fast <2 x float> %arg1, <float 1.0, float 1.0>
+  %tmp1 = fmul fast <2 x float> %arg0, %add
+  ret <2 x float> %tmp1
+}
+
+define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, -v2, v0, v0
+; FMA-NEXT:    v_fma_f32 v1, -v3, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_sub_f32_e32 v3, 1.0, v3
+; NOFUSE-NEXT:    v_sub_f32_e32 v2, 1.0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, -v2, v0, v0
+; FMAD-NEXT:    v_mad_f32 v1, -v3, v1, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fsub  fast <2 x float> <float 1.0, float 1.0>, %arg1
+  %tmp1 = fmul fast <2 x float> %arg0, %add
+  ret <2 x float> %tmp1
+}
+
+define <2 x float> @unsafe_fast_fmul_fadd_distribute_post_legalize_f32(float %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, v0, v1, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd fast float %arg0, 1.0
+  %splat = insertelement <2 x float> undef, float %add, i32 0
+  %tmp1 = fmul fast <2 x float> %arg1, %splat
+  ret <2 x float> %tmp1
+}
+
+define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, -v0, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_sub_f32_e32 v0, 1.0, v0
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, -v0, v1, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %sub = fsub fast float 1.0, %arg0
+  %splat = insertelement <2 x float> undef, float %sub, i32 0
+  %tmp1 = fmul fast <2 x float> %arg1, %splat
+  ret <2 x float> %tmp1
+}
+
+attributes #0 = { "no-infs-fp-math"="true" "unsafe-fp-math"="true" }