Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -109,6 +109,7 @@ /// \return true if the combine is running prior to legalization, or if \p /// Query is legal on the target. + bool isLegal(const LegalityQuery &Query) const; bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const; /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes @@ -329,6 +330,14 @@ bool matchCombineFAbsOfFAbs(MachineInstr &MI, Register &Src); bool applyCombineFAbsOfFAbs(MachineInstr &MI, Register &Src); + /// Transform (fadd (fmul x, y), z) -> (fma x, y, z) + bool matchCombineFaddFmulToFmadOrFma( + MachineInstr &MI, + std::tuple<Register, Register, Register, unsigned> &MatchInfo); + bool applyCombineFaddFmulToFmadOrFma( + MachineInstr &MI, + std::tuple<Register, Register, Register, unsigned> &MatchInfo); + /// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x). bool matchCombineTruncOfExt(MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo); @@ -538,6 +547,7 @@ SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx, const SmallVector<Register, 8> &RegsToVisit, const unsigned MemSizeInBits); + bool isContractableFMUL(const MachineInstr &MI, bool AllowFusionGlobally); }; } // namespace llvm Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -792,6 +792,20 @@ return false; } + virtual bool enableAggressiveFMAFusion(LLT Ty) const { + switch (Ty.getSizeInBits()) { + case 16: + return enableAggressiveFMAFusion(EVT(MVT::f16)); + case 32: + return enableAggressiveFMAFusion(EVT(MVT::f32)); + case 64: + return enableAggressiveFMAFusion(EVT(MVT::f64)); + default: + break; + } + return false; + } + /// Return the ValueType of the result of SETCC operations. 
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const; @@ -1097,6 +1111,21 @@ return getOperationAction(EqOpc, VT); } + bool isOperationLegalOrCustom(unsigned Op, const LLT Ty, + bool LegalOnly = false) const { + switch (Ty.getSizeInBits()) { + case 16: + return isOperationLegalOrCustom(Op, EVT(MVT::f16), LegalOnly); + case 32: + return isOperationLegalOrCustom(Op, EVT(MVT::f32), LegalOnly); + case 64: + return isOperationLegalOrCustom(Op, EVT(MVT::f64), LegalOnly); + default: + break; + } + return false; + } + /// Return true if the specified operation is legal on this target or can be /// made legal with custom lowering. This is used to help guide high-level /// lowering decisions. LegalOnly is an optional convenience for code paths @@ -2661,6 +2690,11 @@ return isFPExtFree(DestVT, SrcVT); } + virtual bool isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, + LLT DestTy, LLT SrcTy) const { + return false; + } + /// Return true if folding a vector load into ExtVal (a sign, zero, or any /// extend node) is profitable. virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const { return false; } @@ -2697,6 +2731,30 @@ return false; } + virtual bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + const LLT) const { + return false; + } + + virtual bool isFMADLegal(const MachineInstr &MI, const LLT Ty) const { + assert((MI.getOpcode() == TargetOpcode::G_FADD || + MI.getOpcode() == TargetOpcode::G_FSUB || + MI.getOpcode() == TargetOpcode::G_FMUL) && + "unexpected node in FMAD forming combine"); + switch (Ty.getSizeInBits()) { + case 16: + return isOperationLegal(TargetOpcode::G_FMAD, EVT(MVT::f16)); + case 32: + return isOperationLegal(TargetOpcode::G_FMAD, EVT(MVT::f32)); + case 64: + return isOperationLegal(TargetOpcode::G_FMAD, EVT(MVT::f64)); + default: + break; + } + + return false; + } + /// Returns true if be combined with to form an ISD::FMAD. 
\p N may be an /// ISD::FADD, ISD::FSUB, or an ISD::FMUL which will be distributed into an /// fadd/fsub. Index: llvm/include/llvm/Target/GlobalISel/Combine.td =================================================================== --- llvm/include/llvm/Target/GlobalISel/Combine.td +++ llvm/include/llvm/Target/GlobalISel/Combine.td @@ -553,6 +553,18 @@ [{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]), (apply [{ return Helper.applyLoadOrCombine(*${root}, ${info}); }])>; +// Transform (fadd x, (fmul y, z)) -> (fma y, z, x) +// (fadd (fmul x, y), z) -> (fma x, y, z) +def combine_fadd_fmul_to_fma_info : + GIDefMatchData<"std::tuple<Register, Register, Register, unsigned>">; +def combine_fadd_fmul_to_fma: GICombineRule< + (defs root:$root, combine_fadd_fmul_to_fma_info:$info), + (match (wip_match_opcode G_FADD):$root, + [{ return Helper.matchCombineFaddFmulToFmadOrFma(*${root}, + ${info}); }]), + (apply [{ return Helper.applyCombineFaddFmulToFmadOrFma(*${root}, + ${info}); }])>; + // Currently only the one combine above. def insert_vec_elt_combines : GICombineGroup< [combine_insert_vec_elts_build_vector]>; @@ -595,4 +607,5 @@ unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc, unmerge_zext_to_zext, trunc_ext_fold, trunc_shl, const_combines, xor_of_and_with_same_reg, ptr_add_with_zero, - shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine]>; + shift_immed_chain, shift_of_shifted_logic_chain, + combine_fadd_fmul_to_fma]>; Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -113,6 +113,11 @@ return !LI || LI->getAction(Query).Action == LegalizeActions::Legal; } +bool CombinerHelper::isLegal( + const LegalityQuery &Query) const { + return LI && LI->getAction(Query).Action == LegalizeActions::Legal; +} + void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const { 
Observer.changingAllUsesOfReg(MRI, FromReg); @@ -3567,6 +3572,105 @@ return true; } +bool CombinerHelper::isContractableFMUL(const MachineInstr &MI, + bool AllowFusionGlobally) { + if (MI.getOpcode() != TargetOpcode::G_FMUL) + return false; + return AllowFusionGlobally || + MI.getFlag(MachineInstr::MIFlag::FmReassoc) || + MI.getFlag(MachineInstr::MIFlag::FmContract); +} + +bool CombinerHelper::matchCombineFaddFmulToFmadOrFma( + MachineInstr &MI, + std::tuple<Register, Register, Register, unsigned> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + auto *MF = MI.getParent()->getParent(); + const auto &TLI = *MF->getSubtarget().getTargetLowering(); + const TargetOptions &Options = MF->getTarget().Options; + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcType = MRI.getType(MI.getOperand(1).getReg()); + MachineInstr *MI0 = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *MI1 = MRI.getVRegDef(MI.getOperand(2).getReg()); + + bool LegalOperations = + isLegal({TargetOpcode::G_FADD, {DstType, SrcType}}); + // Floating-point multiply-add with intermediate rounding. + bool HasFMAD = (LegalOperations && TLI.isFMADLegal(MI, DstType)); + // Floating-point multiply-add without intermediate rounding. + bool HasFMA = + TLI.isFMAFasterThanFMulAndFAdd(*MF, DstType) && + (!LegalOperations || isLegal({TargetOpcode::G_FMA, {DstType, SrcType}})); + + // No valid opcode, do not combine. + if (!HasFMAD && !HasFMA) + return false; + + bool CanFuse = + Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmContract); + bool AllowFusionGlobally = + (Options.AllowFPOpFusion == FPOpFusion::Fast || CanFuse || HasFMAD); + + // If the addition is not contractable, do not combine. + if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract)) + return false; + + unsigned PreferredFusedOpcode = + HasFMAD ? 
TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + bool Aggressive = TLI.enableAggressiveFMAFusion(DstType); + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + if (Aggressive && + isContractableFMUL(*MI0, AllowFusionGlobally) && + isContractableFMUL(*MI1, AllowFusionGlobally)) { + if (std::distance( + MRI.use_instr_nodbg_begin(MI0->getOperand(0).getReg()), + MRI.use_instr_nodbg_end()) > + std::distance( + MRI.use_instr_nodbg_begin(MI1->getOperand(0).getReg()), + MRI.use_instr_nodbg_end())) + std::swap(MI0, MI1); + } + + // fold (fadd (fmul x, y), z) -> (fma x, y, z) + if (isContractableFMUL(*MI0, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(MI0->getOperand(0).getReg()))) { + MatchInfo = {MI0->getOperand(1).getReg(), + MI0->getOperand(2).getReg(), + MI1->getOperand(0).getReg(), + PreferredFusedOpcode}; + return true; + } + + // fold (fadd x, (fmul y, z)) -> (fma y, z, x) + if (isContractableFMUL(*MI1, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))) { + MatchInfo = {MI1->getOperand(1).getReg(), + MI1->getOperand(2).getReg(), + MI0->getOperand(0).getReg(), + PreferredFusedOpcode}; + return true; + } + + return false; +} + +bool CombinerHelper::applyCombineFaddFmulToFmadOrFma( + MachineInstr &MI, + std::tuple<Register, Register, Register, unsigned> &MatchInfo) { + Register Src1, Src2, Src3; + unsigned PreferredFusedOpcode; + std::tie(Src1, Src2, Src3, PreferredFusedOpcode) = MatchInfo; + + Builder.setInstrAndDebugLoc(MI); + Builder.buildInstr(PreferredFusedOpcode, + {MI.getOperand(0).getReg()}, {Src1, Src2, Src3}); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -248,6 +248,8 @@ bool 
isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override; + bool isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, LLT DestTy, + LLT SrcTy) const override; bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override; @@ -373,6 +375,7 @@ MachineBasicBlock *BB) const override; bool hasBitPreservingFPLogic(EVT VT) const override; + bool enableAggressiveFMAFusion(LLT Ty) const override; bool enableAggressiveFMAFusion(EVT VT) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -381,6 +384,9 @@ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + const LLT Ty) const override; + bool isFMADLegal(const MachineInstr &MI, const LLT Ty) const override; bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override; SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -854,6 +854,16 @@ !hasFP32Denormals(DAG.getMachineFunction()); } +bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, + LLT DestTy, LLT SrcTy) const { + return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) && + DestTy.getSizeInBits() == 32 && + SrcTy.getSizeInBits() == 16 && + // TODO: This probably only requires no input flushing? + !hasFP32Denormals(*MI.getMF()); +} + bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. 
@@ -4320,6 +4330,20 @@ return true; } +bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { + switch (Ty.getSizeInBits()) { + case 16: + return enableAggressiveFMAFusion(EVT(MVT::f16)); + case 32: + return enableAggressiveFMAFusion(EVT(MVT::f32)); + case 64: + return enableAggressiveFMAFusion(EVT(MVT::f64)); + default: + break; + } + return true; +} + EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const { if (!VT.isVector()) { @@ -4385,6 +4409,34 @@ return false; } +bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + const LLT Ty) const { + switch(Ty.getSizeInBits()) { + case 16: + return isFMAFasterThanFMulAndFAdd(MF, EVT(MVT::f16)); + case 32: + return isFMAFasterThanFMulAndFAdd(MF, EVT(MVT::f32)); + case 64: + return isFMAFasterThanFMulAndFAdd(MF, EVT(MVT::f64)); + default: + break; + } + + return false; +} + +bool SITargetLowering::isFMADLegal(const MachineInstr &MI, + const LLT Ty) const { + if (Ty.getSizeInBits() == 16) + return Subtarget->hasMadF16() && + !hasFP64FP16Denormals(*MI.getMF()); + if (Ty.getSizeInBits() == 32) + return Subtarget->hasMadMacF32Insts() && + !hasFP32Denormals(*MI.getMF()); + + return false; +} + bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const { // TODO: Check future ftz flag Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -0,0 +1,304 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s +; RUN: 
llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s + +; fold (fadd (fmul x, y), z) -> (fma x, y, z) +; fold (fadd x, (fmul y, z)) -> (fma y, z, x) + +define float @test_f32_add_mul(float %x, float %y, float %z) { +; GFX9-LABEL: test_f32_add_mul: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_f32_add_mul: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_f32_add_mul: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_f32_add_mul: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_f32_add_mul: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_f32_add_mul: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul float %x, %y + %b = fadd float %a, %z + ret float %b +} + +define float @test_f32_add_mul_rhs(float %x, float %y, float %z) { +; GFX9-LABEL: test_f32_add_mul_rhs: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_f32_add_mul_rhs: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_f32_add_mul_rhs: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_f32_add_mul_rhs: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_f32_add_mul_rhs: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_f32_add_mul_rhs: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul float %x, %y + %b = fadd float %z, %a + ret float %b +} + +define half @test_half_add_mul(half %x, half %y, half %z) { +; GFX9-LABEL: test_half_add_mul: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_half_add_mul: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_half_add_mul: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_half_add_mul: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_half_add_mul: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_half_add_mul: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul half %x, %y + %b = fadd half %a, %z + ret half %b +} + +define half @test_half_add_mul_rhs(half %x, half %y, half %z) { +; GFX9-LABEL: test_half_add_mul_rhs: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_f16_e32 v0, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_half_add_mul_rhs: +; GFX9-CONTRACT: ; 
%bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_half_add_mul_rhs: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_half_add_mul_rhs: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f16_e32 v0, v2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_half_add_mul_rhs: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_half_add_mul_rhs: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul half %x, %y + %b = fadd half %z, %a + ret half %b +} + +define double @test_double_add_mul(double %x, double %y, double %z) { +; GFX9-LABEL: test_double_add_mul: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_double_add_mul: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-CONTRACT-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_double_add_mul: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_double_add_mul: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_double_add_mul: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_double_add_mul: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul double %x, %y + %b = fadd double %a, %z + ret double %b +} + +define double @test_double_add_mul_rhs(double %x, double %y, double %z) { +; GFX9-LABEL: test_double_add_mul_rhs: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_double_add_mul_rhs: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_double_add_mul_rhs: +; 
GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_double_add_mul_rhs: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_add_f64 v[0:1], v[6:7], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_double_add_mul_rhs: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_double_add_mul_rhs: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX10-DENORM-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[6:7], v[0:1] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul double %x, %y + %b = fadd double %z, %a + ret double %b +} Index: llvm/test/CodeGen/PowerPC/fma-ext.ll =================================================================== --- llvm/test/CodeGen/PowerPC/fma-ext.ll +++ llvm/test/CodeGen/PowerPC/fma-ext.ll @@ -1,11 +1,11 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -fp-contract=fast -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX %s -define 
double @test_FMADD_EXT1(float %A, float %B, double %C) { - %D = fmul float %A, %B ; [#uses=1] - %E = fpext float %D to double ; [#uses=1] - %F = fadd double %E, %C ; [#uses=1] - ret double %F +define float @test_FMADD_EXT1(half %A, half %B, float %C) #1 { + %D = fmul half %A, %B ; [#uses=1] + %E = fpext half %D to float ; [#uses=1] + %F = fadd float %E, %C ; [#uses=1] + ret float %F ; CHECK-LABEL: test_FMADD_EXT1: ; CHECK: fmadd ; CHECK-NEXT: blr @@ -15,11 +15,11 @@ ; CHECK-VSX-NEXT: blr } -define double @test_FMADD_EXT2(float %A, float %B, double %C) { - %D = fmul float %A, %B ; [#uses=1] - %E = fpext float %D to double ; [#uses=1] - %F = fadd double %C, %E ; [#uses=1] - ret double %F +define float @test_FMADD_EXT2(half %A, half %B, float %C) #1 { + %D = fmul half %A, %B ; [#uses=1] + %E = fpext half %D to float ; [#uses=1] + %F = fadd float %C, %E ; [#uses=1] + ret float %F ; CHECK-LABEL: test_FMADD_EXT2: ; CHECK: fmadd ; CHECK-NEXT: blr @@ -29,11 +29,11 @@ ; CHECK-VSX-NEXT: blr } -define double @test_FMSUB_EXT1(float %A, float %B, double %C) { - %D = fmul float %A, %B ; [#uses=1] - %E = fpext float %D to double ; [#uses=1] - %F = fsub double %E, %C ; [#uses=1] - ret double %F +define float @test_FMSUB_EXT1(half %A, half %B, float %C) #1 { + %D = fmul half %A, %B ; [#uses=1] + %E = fpext half %D to float ; [#uses=1] + %F = fsub float %E, %C ; [#uses=1] + ret float %F ; CHECK-LABEL: test_FMSUB_EXT1: ; CHECK: fmsub ; CHECK-NEXT: blr @@ -43,11 +43,11 @@ ; CHECK-VSX-NEXT: blr } -define double @test_FMSUB_EXT2(float %A, float %B, double %C) { - %D = fmul float %A, %B ; [#uses=1] - %E = fpext float %D to double ; [#uses=1] - %F = fsub double %C, %E ; [#uses=1] - ret double %F +define float @test_FMSUB_EXT2(half %A, half %B, float %C) #1 { + %D = fmul half %A, %B ; [#uses=1] + %E = fpext half %D to float ; [#uses=1] + %F = fsub float %C, %E ; [#uses=1] + ret float %F ; CHECK-LABEL: test_FMSUB_EXT2: ; CHECK: fneg ; CHECK-NEXT: fmadd @@ -60,11 +60,11 @@ } ; need nsz flag 
to generate fnmsub since it may affect sign of zero -define double @test_FMSUB_EXT2_NSZ(float %A, float %B, double %C) { - %D = fmul nsz float %A, %B ; [#uses=1] - %E = fpext float %D to double ; [#uses=1] - %F = fsub nsz double %C, %E ; [#uses=1] - ret double %F +define float @test_FMSUB_EXT2_NSZ(half %A, half %B, float %C) #1 { + %D = fmul nsz half %A, %B ; [#uses=1] + %E = fpext half %D to float ; [#uses=1] + %F = fsub nsz float %C, %E ; [#uses=1] + ret float %F ; CHECK-LABEL: test_FMSUB_EXT2_NSZ: ; CHECK: fnmsub ; CHECK-NEXT: blr @@ -74,12 +74,12 @@ ; CHECK-VSX-NEXT: blr } -define double @test_FMSUB_EXT3(float %A, float %B, double %C) { - %D = fmul float %A, %B ; [#uses=1] - %E = fsub float -0.000000e+00, %D ; [#uses=1] - %F = fpext float %E to double ; [#uses=1] - %G = fsub double %F, %C ; [#uses=1] - ret double %G +define float @test_FMSUB_EXT3(half %A, half %B, float %C) #1 { + %D = fmul half %A, %B ; [#uses=1] + %E = fsub half -0.000000e+00, %D ; [#uses=1] + %F = fpext half %E to float ; [#uses=1] + %G = fsub float %F, %C ; [#uses=1] + ret float %G ; CHECK-LABEL: test_FMSUB_EXT3: ; CHECK: fnmadd @@ -91,12 +91,12 @@ ; CHECK-VSX-NEXT: blr } -define double @test_FMSUB_EXT4(float %A, float %B, double %C) { - %D = fmul float %A, %B ; [#uses=1] - %E = fpext float %D to double ; [#uses=1] - %F = fsub double -0.000000e+00, %E ; [#uses=1] - %G = fsub double %F, %C ; [#uses=1] - ret double %G +define float @test_FMSUB_EXT4(half %A, half %B, float %C) #1 { + %D = fmul half %A, %B ; [#uses=1] + %E = fpext half %D to float ; [#uses=1] + %F = fsub float -0.000000e+00, %E ; [#uses=1] + %G = fsub float %F, %C ; [#uses=1] + ret float %G ; CHECK-LABEL: test_FMSUB_EXT4: ; CHECK: fnmadd @@ -107,3 +107,5 @@ ; CHECK-VSX-NEXT: blr } + +attributes #1 = { "denormal-fp-math-f32"="preserve-sign" }