Index: llvm/trunk/docs/LangRef.rst
===================================================================
--- llvm/trunk/docs/LangRef.rst
+++ llvm/trunk/docs/LangRef.rst
@@ -13021,6 +13021,41 @@
 value operands and has the same type as the operands. The remainder has the
 same sign as the dividend.
 
+'``llvm.experimental.constrained.fma``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.fma(<type> <op1>, <type> <op2>, <type> <op3>,
+                                          metadata <rounding mode>,
+                                          metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.fma``' intrinsic returns the result of a
+fused multiply-add operation on its operands.
+
+Arguments:
+""""""""""
+
+The first three arguments to the '``llvm.experimental.constrained.fma``'
+intrinsic must be :ref:`floating point <t_floating>` or :ref:`vector
+<t_vector>` of floating point values. All arguments must have identical types.
+
+The fourth and fifth arguments specify the rounding mode and exception behavior
+as described above.
+
+Semantics:
+""""""""""
+
+The result produced is the product of the first two operands added to the third
+operand, computed with infinite precision and then rounded to the target
+precision.
+
 Constrained libm-equivalent Intrinsics
 --------------------------------------
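As a concrete illustration of the syntax documented in the LangRef hunk above, a call to the new intrinsic in IR could look like the following minimal sketch. It is not part of the patch: the wrapper function ``@fma_example`` and its operand values are invented for the example, while the intrinsic signature and the metadata strings follow the conventions used by the existing constrained intrinsics and by the tests at the end of this patch::

  declare double @llvm.experimental.constrained.fma.f64(double, double, double,
                                                        metadata, metadata)

  ; Computes %a * %b + %c as a single fused operation with one rounding step,
  ; using the dynamic rounding mode and strict exception semantics.
  define double @fma_example(double %a, double %b, double %c) {
  entry:
    %r = call double @llvm.experimental.constrained.fma.f64(
                         double %a, double %b, double %c,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.strict")
    ret double %r
  }

The two test files below exercise exactly this call pattern.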
Index: llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
@@ -263,6 +263,7 @@
     /// They are used to limit optimizations while the DAG is being
     /// optimized.
     STRICT_FADD, STRICT_FSUB, STRICT_FMUL, STRICT_FDIV, STRICT_FREM,
+    STRICT_FMA,
 
     /// Constrained versions of libm-equivalent floating point intrinsics.
     /// These will be lowered to the equivalent non-constrained pseudo-op
Index: llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -623,13 +623,14 @@
   /// Test if this node is a strict floating point pseudo-op.
   bool isStrictFPOpcode() {
     switch (NodeType) {
-      default:
+      default:
         return false;
       case ISD::STRICT_FADD:
       case ISD::STRICT_FSUB:
       case ISD::STRICT_FMUL:
       case ISD::STRICT_FDIV:
       case ISD::STRICT_FREM:
+      case ISD::STRICT_FMA:
      case ISD::STRICT_FSQRT:
      case ISD::STRICT_FPOW:
      case ISD::STRICT_FPOWI:
Index: llvm/trunk/include/llvm/IR/IntrinsicInst.h
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicInst.h
+++ llvm/trunk/include/llvm/IR/IntrinsicInst.h
@@ -167,6 +167,7 @@
     };
 
     bool isUnaryOp() const;
+    bool isTernaryOp() const;
     RoundingMode getRoundingMode() const;
     ExceptionBehavior getExceptionBehavior() const;
 
@@ -178,6 +179,7 @@
       case Intrinsic::experimental_constrained_fmul:
      case Intrinsic::experimental_constrained_fdiv:
      case Intrinsic::experimental_constrained_frem:
+     case Intrinsic::experimental_constrained_fma:
      case Intrinsic::experimental_constrained_sqrt:
      case Intrinsic::experimental_constrained_pow:
      case Intrinsic::experimental_constrained_powi:
Index: llvm/trunk/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/trunk/include/llvm/IR/Intrinsics.td
+++ llvm/trunk/include/llvm/IR/Intrinsics.td
@@ -490,6 +490,13 @@
                                                    llvm_metadata_ty,
                                                    llvm_metadata_ty ]>;
 
+  def int_experimental_constrained_fma : Intrinsic<[ llvm_anyfloat_ty ],
+                                                   [ LLVMMatchType<0>,
+                                                     LLVMMatchType<0>,
+                                                     LLVMMatchType<0>,
+                                                     llvm_metadata_ty,
+                                                     llvm_metadata_ty ]>;
+
   // These intrinsics are sensitive to the rounding mode so we need constrained
   // versions of each of them.  When strict rounding and exception control are
   // not required the non-constrained versions of these intrinsics should be
Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -907,6 +907,7 @@
   case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break;
   case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break;
   case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break;
+  case ISD::STRICT_FMA: EqOpc = ISD::FMA; break;
   case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break;
   case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break;
   case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break;
@@ -1072,6 +1073,7 @@
     }
     break;
   case ISD::STRICT_FSQRT:
+  case ISD::STRICT_FMA:
   case ISD::STRICT_FPOW:
   case ISD::STRICT_FPOWI:
   case ISD::STRICT_FSIN:
@@ -1240,7 +1242,7 @@
     // If the index is dependent on the store we will introduce a cycle when
     // creating the load (the load uses the index, and by replacing the chain
     // we will make the index dependent on the load). Also, the store might be
-    // dependent on the extractelement and introduce a cycle when creating
+    // dependent on the extractelement and introduce a cycle when creating
     // the load.
     if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) ||
         ST->hasPredecessor(Op.getNode()))
@@ -4065,6 +4067,10 @@
     Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64,
                                       RTLIB::FMA_F80, RTLIB::FMA_F128,
                                       RTLIB::FMA_PPCF128));
+  case ISD::STRICT_FMA:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64,
+                                      RTLIB::FMA_F80, RTLIB::FMA_F128,
+                                      RTLIB::FMA_PPCF128));
     break;
   case ISD::FADD:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64,
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6695,6 +6695,7 @@
   unsigned OrigOpc = Node->getOpcode();
   unsigned NewOpc;
   bool IsUnary = false;
+  bool IsTernary = false;
   switch (OrigOpc) {
   default:
     llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
@@ -6703,6 +6704,7 @@
   case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break;
   case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break;
   case ISD::STRICT_FREM: NewOpc = ISD::FREM; break;
+  case ISD::STRICT_FMA: NewOpc = ISD::FMA; IsTernary = true; break;
   case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break;
   case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break;
   case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break;
@@ -6729,6 +6731,10 @@
   SDNode *Res = nullptr;
   if (IsUnary)
     Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) });
+  else if (IsTernary)
+    Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1),
+                                           Node->getOperand(2),
+                                           Node->getOperand(3)});
   else
     Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1),
                                            Node->getOperand(2) });
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5432,6 +5432,7 @@
   case Intrinsic::experimental_constrained_fmul:
   case Intrinsic::experimental_constrained_fdiv:
   case Intrinsic::experimental_constrained_frem:
+  case Intrinsic::experimental_constrained_fma:
   case Intrinsic::experimental_constrained_sqrt:
   case Intrinsic::experimental_constrained_pow:
   case Intrinsic::experimental_constrained_powi:
@@ -5963,6 +5964,9 @@
   case Intrinsic::experimental_constrained_frem:
     Opcode = ISD::STRICT_FREM;
     break;
+  case Intrinsic::experimental_constrained_fma:
+    Opcode = ISD::STRICT_FMA;
+    break;
   case Intrinsic::experimental_constrained_sqrt:
     Opcode = ISD::STRICT_FSQRT;
     break;
@@ -6009,10 +6013,15 @@
   SDVTList VTs = DAG.getVTList(ValueVTs);
   SDValue Result;
   if (FPI.isUnaryOp())
-    Result = DAG.getNode(Opcode, sdl, VTs,
+    Result = DAG.getNode(Opcode, sdl, VTs,
                          { Chain, getValue(FPI.getArgOperand(0)) });
+  else if (FPI.isTernaryOp())
+    Result = DAG.getNode(Opcode, sdl, VTs,
+                         { Chain, getValue(FPI.getArgOperand(0)),
+                           getValue(FPI.getArgOperand(1)),
+                           getValue(FPI.getArgOperand(2)) });
   else
-    Result = DAG.getNode(Opcode, sdl, VTs,
+    Result = DAG.getNode(Opcode, sdl, VTs,
                          { Chain, getValue(FPI.getArgOperand(0)),
                            getValue(FPI.getArgOperand(1)) });
Index: llvm/trunk/lib/IR/IntrinsicInst.cpp
===================================================================
--- llvm/trunk/lib/IR/IntrinsicInst.cpp
+++ llvm/trunk/lib/IR/IntrinsicInst.cpp
@@ -14,10 +14,10 @@
 // are all subclasses of the CallInst class.  Note that none of these classes
 // has state or virtual methods, which is an important part of this gross/neat
 // hack working.
-//
+//
 // In some cases, arguments to intrinsics need to be generic and are defined as
 // type pointer to empty struct { }*. To access the real item of interest the
-// cast instruction needs to be stripped away.
+// cast instruction needs to be stripped away.
 //
 //===----------------------------------------------------------------------===//
 
@@ -98,7 +98,7 @@
 ConstrainedFPIntrinsic::RoundingMode
 ConstrainedFPIntrinsic::getRoundingMode() const {
   unsigned NumOperands = getNumArgOperands();
-  Metadata *MD =
+  Metadata *MD =
       dyn_cast<MetadataAsValue>(getArgOperand(NumOperands - 2))->getMetadata();
   if (!MD || !isa<MDString>(MD))
     return rmInvalid;
@@ -118,7 +118,7 @@
 ConstrainedFPIntrinsic::ExceptionBehavior
 ConstrainedFPIntrinsic::getExceptionBehavior() const {
   unsigned NumOperands = getNumArgOperands();
-  Metadata *MD =
+  Metadata *MD =
       dyn_cast<MetadataAsValue>(getArgOperand(NumOperands - 1))->getMetadata();
   if (!MD || !isa<MDString>(MD))
     return ebInvalid;
@@ -132,7 +132,7 @@
 
 bool ConstrainedFPIntrinsic::isUnaryOp() const {
   switch (getIntrinsicID()) {
-    default:
+    default:
       return false;
     case Intrinsic::experimental_constrained_sqrt:
     case Intrinsic::experimental_constrained_sin:
@@ -147,3 +147,13 @@
       return true;
   }
 }
+
+bool ConstrainedFPIntrinsic::isTernaryOp() const {
+  switch (getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::experimental_constrained_fma:
+      return true;
+  }
+}
+
Index: llvm/trunk/lib/IR/Verifier.cpp
===================================================================
--- llvm/trunk/lib/IR/Verifier.cpp
+++ llvm/trunk/lib/IR/Verifier.cpp
@@ -3973,6 +3973,7 @@
   case Intrinsic::experimental_constrained_fmul:
   case Intrinsic::experimental_constrained_fdiv:
   case Intrinsic::experimental_constrained_frem:
+  case Intrinsic::experimental_constrained_fma:
   case Intrinsic::experimental_constrained_sqrt:
   case Intrinsic::experimental_constrained_pow:
   case Intrinsic::experimental_constrained_powi:
@@ -4433,8 +4434,9 @@
 
 void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
   unsigned NumOperands = FPI.getNumArgOperands();
-  Assert(((NumOperands == 3 && FPI.isUnaryOp()) || (NumOperands == 4)),
-         "invalid arguments for constrained FP intrinsic", &FPI);
+  Assert(((NumOperands == 5 && FPI.isTernaryOp()) ||
+          (NumOperands == 3 && FPI.isUnaryOp()) || (NumOperands == 4)),
+         "invalid arguments for constrained FP intrinsic", &FPI);
   Assert(isa<MetadataAsValue>(FPI.getArgOperand(NumOperands-1)),
          "invalid exception behavior argument", &FPI);
   Assert(isa<MetadataAsValue>(FPI.getArgOperand(NumOperands-2)),
Index: llvm/trunk/test/CodeGen/X86/fp-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fp-intrinsics.ll
+++ llvm/trunk/test/CodeGen/X86/fp-intrinsics.ll
@@ -1,4 +1,5 @@
-; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck --check-prefix=COMMON --check-prefix=NO-FMA --check-prefix=FMACALL64 --check-prefix=FMACALL32 %s
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck -check-prefix=COMMON --check-prefix=HAS-FMA --check-prefix=FMA64 --check-prefix=FMA32 %s
 
 ; Verify that constants aren't folded to inexact results when the rounding mode
 ; is unknown.
@@ -9,7 +10,7 @@
 ; }
 ;
 ; CHECK-LABEL: f1
-; CHECK: divsd
+; COMMON: divsd
 define double @f1() {
 entry:
   %div = call double @llvm.experimental.constrained.fdiv.f64(
@@ -29,7 +30,7 @@
 ; }
 ;
 ; CHECK-LABEL: f2
-; CHECK: subsd
+; COMMON: subsd
 define double @f2(double %a) {
 entry:
   %div = call double @llvm.experimental.constrained.fsub.f64(
@@ -50,9 +51,9 @@
 ; }
 ;
 ; CHECK-LABEL: f3:
-; CHECK: subsd
-; CHECK: mulsd
-; CHECK: subsd
+; COMMON: subsd
+; COMMON: mulsd
+; COMMON: subsd
 define double @f3(double %a, double %b) {
 entry:
   %sub = call double @llvm.experimental.constrained.fsub.f64(
@@ -81,11 +82,11 @@
 ;   return a;
 ; }
 ;
-;
+;
 ; CHECK-LABEL: f4:
-; CHECK: testl
-; CHECK: jle
-; CHECK: addsd
+; COMMON: testl
+; COMMON: jle
+; COMMON: addsd
 define double @f4(i32 %n, double %a) {
 entry:
   %cmp = icmp sgt i32 %n, 0
@@ -105,7 +106,7 @@
 
 ; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f5
-; CHECK: sqrtsd
+; COMMON: sqrtsd
 define double @f5() {
 entry:
   %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0,
@@ -116,7 +117,7 @@
 
 ; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f6
-; CHECK: pow
+; COMMON: pow
 define double @f6() {
 entry:
   %result = call double @llvm.experimental.constrained.pow.f64(double 42.1,
@@ -128,7 +129,7 @@
 
 ; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f7
-; CHECK: powi
+; COMMON: powi
 define double @f7() {
 entry:
   %result = call double @llvm.experimental.constrained.powi.f64(double 42.1,
@@ -140,7 +141,7 @@
 
 ; Verify that sin(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f8
-; CHECK: sin
+; COMMON: sin
 define double @f8() {
 entry:
   %result = call double @llvm.experimental.constrained.sin.f64(double 42.0,
@@ -151,7 +152,7 @@
 
 ; Verify that cos(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f9
-; CHECK: cos
+; COMMON: cos
 define double @f9() {
 entry:
   %result = call double @llvm.experimental.constrained.cos.f64(double 42.0,
@@ -162,7 +163,7 @@
 
 ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f10
-; CHECK: exp
+; COMMON: exp
 define double @f10() {
 entry:
   %result = call double @llvm.experimental.constrained.exp.f64(double 42.0,
@@ -173,7 +174,7 @@
 
 ; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f11
-; CHECK: exp2
+; COMMON: exp2
 define double @f11() {
 entry:
   %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1,
@@ -184,7 +185,7 @@
 
 ; Verify that log(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f12
-; CHECK: log
+; COMMON: log
 define double @f12() {
 entry:
   %result = call double @llvm.experimental.constrained.log.f64(double 42.0,
@@ -195,7 +196,7 @@
 
 ; Verify that log10(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f13
-; CHECK: log10
+; COMMON: log10
 define double @f13() {
 entry:
   %result = call double @llvm.experimental.constrained.log10.f64(double 42.0,
@@ -206,7 +207,7 @@
 
 ; Verify that log2(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f14
-; CHECK: log2
+; COMMON: log2
 define double @f14() {
 entry:
   %result = call double @llvm.experimental.constrained.log2.f64(double 42.0,
@@ -217,7 +218,8 @@
 
 ; Verify that rint(42.1) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f15
-; CHECK: rint
+; NO-FMA: rint
+; HAS-FMA: vroundsd
 define double @f15() {
 entry:
   %result = call double @llvm.experimental.constrained.rint.f64(double 42.1,
@@ -229,7 +231,8 @@
 ; Verify that nearbyint(42.1) isn't simplified when the rounding mode is
 ; unknown.
 ; CHECK-LABEL: f16
-; CHECK: nearbyint
+; NO-FMA: nearbyint
+; HAS-FMA: vroundsd
 define double @f16() {
 entry:
   %result = call double @llvm.experimental.constrained.nearbyint.f64(
@@ -239,6 +242,38 @@
   ret double %result
 }
 
+; Verify that fma(3.5) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f17
+; FMACALL32: jmp fmaf # TAILCALL
+; FMA32: vfmadd213ss
+define float @f17() {
+entry:
+  %result = call float @llvm.experimental.constrained.fma.f32(
+                                               float 3.5,
+                                               float 3.5,
+                                               float 3.5,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret float %result
+}
+
+; Verify that fma(42.1) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f18
+; FMACALL64: jmp fma # TAILCALL
+; FMA64: vfmadd213sd
+define double @f18() {
+entry:
+  %result = call double @llvm.experimental.constrained.fma.f64(
+                                               double 42.1,
+                                               double 42.1,
+                                               double 42.1,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
 @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata"
 declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
@@ -256,3 +291,5 @@
 declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata)
+declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
Index: llvm/trunk/test/Feature/fp-intrinsics.ll
===================================================================
--- llvm/trunk/test/Feature/fp-intrinsics.ll
+++ llvm/trunk/test/Feature/fp-intrinsics.ll
@@ -73,7 +73,7 @@
 ;   return a;
 ; }
 ;
-;
+;
 ; CHECK-LABEL: @f4
 ; CHECK-NOT: select
 ; CHECK: br i1 %cmp
@@ -94,7 +94,6 @@
   ret double %a.0
 }
 
-
 ; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f5
 ; CHECK: call double @llvm.experimental.constrained.sqrt
@@ -231,6 +230,18 @@
   ret double %result
 }
 
+; Verify that fma(42.1) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f17
+; CHECK: call double @llvm.experimental.constrained.fma
+define double @f17() {
+entry:
+  %result = call double @llvm.experimental.constrained.fma.f64(double 42.1, double 42.1, double 42.1,
+                                                               metadata !"round.dynamic",
+                                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
 @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata"
 declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
@@ -248,3 +259,4 @@
 declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)