Index: lib/Transforms/InstCombine/InstCombineAddSub.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -182,6 +182,7 @@
     Value *createFAdd(Value *Opnd0, Value *Opnd1);
     Value *createFMul(Value *Opnd0, Value *Opnd1);
     Value *createFDiv(Value *Opnd0, Value *Opnd1);
+    Value *createFMA(Value *Opnd0, Value *Opnd1, Value *Opnd2, bool IsFMulAdd);
     Value *createFNeg(Value *V);
     Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
     void createInstPostProc(Instruction *NewInst, bool NoNumber = false);
@@ -426,9 +427,37 @@
           I->getOpcode() == Instruction::FSub) && "Expect add/sub");
 
   Instruction *I0 = dyn_cast<Instruction>(I->getOperand(0));
-  Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1));
+  if (!I0)
+    return nullptr;
+
+  if (IntrinsicInst *II0 = dyn_cast<IntrinsicInst>(I0)) {
+    // (fadd (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, z))
+    //
+    // This could be done for fma, but whether that is faster is target
+    // dependent.
+    if (I->getOpcode() == Instruction::FAdd &&
+        II0->hasUnsafeAlgebra() &&
+        II0->getIntrinsicID() == Intrinsic::fmuladd &&
+        II0->hasOneUse()) {
+      Value *U, *V;
+      Value *FMA2 = II0->getArgOperand(2);
+      if (!FMA2->hasOneUse() ||
+          !match(FMA2, m_FMul(m_Value(U), m_Value(V))) ||
+          !cast<Instruction>(FMA2)->hasUnsafeAlgebra())
+        return nullptr;
+
+      Value *X = II0->getArgOperand(0);
+      Value *Y = II0->getArgOperand(1);
+      Value *Z = I->getOperand(1);
+      Value *FMAUVZ = createFMA(U, V, Z, true);
+      return createFMA(X, Y, FMAUVZ, true);
+    }
 
-  if (!I0 || !I1 || I0->getOpcode() != I1->getOpcode())
+    return nullptr;
+  }
+
+  Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1));
+  if (!I1 || I0->getOpcode() != I1->getOpcode())
     return nullptr;
 
   bool isMpy = false;
@@ -769,6 +798,18 @@
   return V;
 }
 
+Value *FAddCombine::createFMA(Value *Opnd0, Value *Opnd1, Value *Opnd2,
+                              bool FMulAdd) {
+  Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+  Intrinsic::ID Opc = FMulAdd ? Intrinsic::fmuladd : Intrinsic::fma;
+  Value *F = Intrinsic::getDeclaration(M, Opc, Opnd0->getType());
+
+  Value *V = Builder->CreateCall(F, { Opnd0, Opnd1, Opnd2 });
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    createInstPostProc(I);
+  return V;
+}
+
 void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) {
   NewInstr->setDebugLoc(Instr->getDebugLoc());
Index: test/Transforms/InstCombine/fmuladd-opt.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/fmuladd-opt.ll
@@ -0,0 +1,160 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+; (fadd (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, z))
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float %v, float %z)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %x, float %y, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_sub_fmuladd_fmul(
+; CHECK: %mul.u.v = fmul fast float %u, %v
+; CHECK-NEXT: %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fsub fast float %fma, %z
+; CHECK-NEXT: ret float %add
+define float @fast_sub_fmuladd_fmul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fsub fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_multi_use_mul(
+; CHECK: fmul fast
+; CHECK: call fast float @llvm.fmuladd.f32(
+; CHECK: fadd fast
+define float @fast_add_fmuladd_fmul_multi_use_mul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  store volatile float %mul.u.v, float* undef
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_multi_use_fmuladd(
+; CHECK: fmul fast
+; CHECK: call fast float @llvm.fmuladd.f32(
+; CHECK: fadd fast
+define float @fast_add_fmuladd_fmul_multi_use_fmuladd(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  store volatile float %fma, float* undef
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_constant_x(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float %v, float %z)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %y, float 8.000000e+00, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul_constant_x(float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float 8.0, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_constant_y(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float %v, float %z)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %x, float 4.000000e+00, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul_constant_y(float %x, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float 4.0, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_constant_v(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float 4.000000e+00, float %z)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %x, float %y, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul_constant_v(float %x, float %y, float %z, float %u) {
+  %mul.u.v = fmul fast float %u, 4.0
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @fast_add_fmuladd_fmul_constant_z(
+; CHECK: %1 = call fast float @llvm.fmuladd.f32(float %u, float %v, float 4.000000e+00)
+; CHECK-NEXT: %2 = call fast float @llvm.fmuladd.f32(float %x, float %y, float %1)
+; CHECK-NEXT: ret float %2
+define float @fast_add_fmuladd_fmul_constant_z(float %x, float %y, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, 4.0
+  ret float %add
+}
+
+; CHECK-LABEL: @missing_fast_add_fmuladd_fmul_0(
+; CHECK: fmul float %u, %v
+; CHECK-NEXT: %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd fast float %fma, %z
+; CHECK-NEXT: ret float %add
+define float @missing_fast_add_fmuladd_fmul_0(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @missing_fast_add_fmuladd_fmul_1(
+; CHECK: %mul.u.v = fmul fast float %u, %v
+; CHECK-NEXT: %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd fast float %fma, %z
+; CHECK-NEXT: ret float %add
+define float @missing_fast_add_fmuladd_fmul_1(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @missing_fast_add_fmuladd_fmul_2(
+; CHECK: %mul.u.v = fmul fast float %u, %v
+; CHECK-NEXT: %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd float %fma, %z
+; CHECK-NEXT: ret float %add
+define float @missing_fast_add_fmuladd_fmul_2(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd float %fma, %z
+  ret float %add
+}
+
+; CHECK-LABEL: @safe_add_fmuladd_fmul(
+; CHECK: %mul.u.v = fmul float %u, %v
+; CHECK-NEXT: %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd float %fma, %z
define float @safe_add_fmuladd_fmul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul float %u, %v
+  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd float %fma, %z
+  ret float %add
+}
+
+; This is not done because it depends on the target whether it is
+; profitable.
+
+; CHECK-LABEL: @fast_add_fma_fmul(
+; CHECK-NEXT: %mul.u.v = fmul fast float %u, %v
+; CHECK-NEXT: %fmuladd = call fast float @llvm.fma.f32(float %x, float %y, float %mul.u.v)
+; CHECK-NEXT: %add = fadd fast float %fmuladd, %z
+; CHECK-NEXT: ret float %add
+define float @fast_add_fma_fmul(float %x, float %y, float %z, float %u, float %v) {
+  %mul.u.v = fmul fast float %u, %v
+  %fmuladd = call fast float @llvm.fma.f32(float %x, float %y, float %mul.u.v)
+  %add = fadd fast float %fmuladd, %z
+  ret float %add
+}
+
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+attributes #0 = { nounwind readnone }
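
A before/after sketch of the new combine, for reference only (this is the same
pattern @fast_add_fmuladd_fmul exercises; the %inner/%outer names are
illustrative, not what instcombine emits):

  ; Before: ((x * y) + (u * v)) + z, with every operation flagged fast.
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z

  ; After: (x * y) + ((u * v) + z), expressed as two nested fmuladd calls.
  %inner = call fast float @llvm.fmuladd.f32(float %u, float %v, float %z)
  %outer = call fast float @llvm.fmuladd.f32(float %x, float %y, float %inner)

Because this reassociates floating-point addition, the combine only fires when
the fmul, the fmuladd, and the fadd all carry unsafe-algebra flags, and only
when the fmul and the fmuladd have no other uses; the missing_fast_* and
multi_use_* tests above cover each of those bail-out conditions.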