Index: docs/LangRef.rst =================================================================== --- docs/LangRef.rst +++ docs/LangRef.rst @@ -4608,13 +4608,13 @@ int i; // offset 0 float f; // offset 4 }; - + struct Outer { float f; // offset 0 double d; // offset 4 struct Inner inner_a; // offset 12 }; - + void f(struct Outer* outer, struct Inner* inner, float* f, int* i, char* c) { outer->f = 0; // tag0: (OuterStructTy, FloatScalarTy, 0) outer->inner_a.i = 0; // tag1: (OuterStructTy, IntScalarTy, 12) @@ -5150,10 +5150,10 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``invariant.group`` metadata may be attached to ``load``/``store`` instructions. -The existence of the ``invariant.group`` metadata on the instruction tells -the optimizer that every ``load`` and ``store`` to the same pointer operand -within the same invariant group can be assumed to load or store the same -value (but see the ``llvm.invariant.group.barrier`` intrinsic which affects +The existence of the ``invariant.group`` metadata on the instruction tells +the optimizer that every ``load`` and ``store`` to the same pointer operand +within the same invariant group can be assumed to load or store the same +value (but see the ``llvm.invariant.group.barrier`` intrinsic which affects when two pointers are considered the same). Pointers returned by bitcast or getelementptr with only zero indices are considered the same. @@ -5166,26 +5166,26 @@ %ptr = alloca i8 store i8 42, i8* %ptr, !invariant.group !0 call void @foo(i8* %ptr) - + %a = load i8, i8* %ptr, !invariant.group !0 ; Can assume that value under %ptr didn't change call void @foo(i8* %ptr) %b = load i8, i8* %ptr, !invariant.group !1 ; Can't assume anything, because group changed - - %newPtr = call i8* @getPointer(i8* %ptr) + + %newPtr = call i8* @getPointer(i8* %ptr) %c = load i8, i8* %newPtr, !invariant.group !0 ; Can't assume anything, because we only have information about %ptr - + %unknownValue = load i8, i8* @unknownPtr store i8 %unknownValue, i8* %ptr, !invariant.group !0 ; Can assume that %unknownValue == 42 - + call void @foo(i8* %ptr) %newPtr2 = call i8* @llvm.invariant.group.barrier(i8* %ptr) %d = load i8, i8* %newPtr2, !invariant.group !0 ; Can't step through invariant.group.barrier to get value of %ptr - + ... declare void @foo(i8*) declare i8* @getPointer(i8*) declare i8* @llvm.invariant.group.barrier(i8*) - + !0 = !{!"magic ptr"} !1 = !{!"other ptr"} @@ -5194,7 +5194,7 @@ to the SSA value of the pointer operand. .. code-block:: llvm - + %v = load i8, i8* %x, !invariant.group !0 ; if %x mustalias %y then we can replace the above instruction with %v = load i8, i8* %y @@ -6648,9 +6648,9 @@ Note that unsigned integer remainder and signed integer remainder are distinct operations; for signed integer remainder, use '``srem``'. - + Taking the remainder of a division by zero is undefined behavior. -For vectors, if any element of the divisor is zero, the operation has +For vectors, if any element of the divisor is zero, the operation has undefined behavior. Example: @@ -6702,7 +6702,7 @@ distinct operations; for unsigned integer remainder, use '``urem``'. Taking the remainder of a division by zero is undefined behavior. -For vectors, if any element of the divisor is zero, the operation has +For vectors, if any element of the divisor is zero, the operation has undefined behavior. 
Overflow also leads to undefined behavior; this is a rare case, but can
occur, for example, by taking the remainder of a 32-bit division of
@@ -7575,7 +7575,7 @@
 instructions to save cache bandwidth, such as the ``MOVNT`` instruction on
 x86.
 
-The optional ``!invariant.group`` metadata must reference a 
+The optional ``!invariant.group`` metadata must reference a
 single metadata name ``<index>``. See ``invariant.group`` metadata.
 
 Semantics:
@@ -7681,10 +7681,10 @@
 to operate on, a value to compare to the value currently be at that
 address, and a new value to place at that address if the compared values
 are equal. The type of '<cmp>' must be an integer or pointer type whose
-bit width is a power of two greater than or equal to eight and less 
+bit width is a power of two greater than or equal to eight and less
 than or equal to a target-specific size limit. '<cmp>' and '<new>' must
-have the same type, and the type of '<pointer>' must be a pointer to 
-that type. If the ``cmpxchg`` is marked as ``volatile``, then the 
+have the same type, and the type of '<pointer>' must be a pointer to
+that type. If the ``cmpxchg`` is marked as ``volatile``, then the
 optimizer is not allowed to modify the number or order of execution of
 this ``cmpxchg`` with other :ref:`volatile operations <volatile>`.
@@ -8972,7 +8972,7 @@
 ``tail`` or ``musttail`` markers to the call. It is used to prevent tail
 call optimization from being performed on the call.
 
-#. The optional ``fast-math flags`` marker indicates that the call has one or more 
+#. The optional ``fast-math flags`` marker indicates that the call has one or more
    :ref:`fast-math flags <fastmath>`, which are optimization hints to enable
    otherwise unsafe floating-point optimizations. Fast-math flags are only valid
    for calls that return a floating-point scalar or vector type.
@@ -12714,7 +12714,7 @@
 Overview:
 """""""""
 
-The '``llvm.invariant.group.barrier``' intrinsic can be used when an invariant 
+The '``llvm.invariant.group.barrier``' intrinsic can be used when an invariant
 established by invariant.group metadata no longer holds, to obtain a new pointer
 value that does not carry the invariant information.
 
@@ -12728,7 +12728,7 @@
 Semantics:
 """"""""""
 
-Returns another pointer that aliases its argument but which is considered different 
+Returns another pointer that aliases its argument but which is considered different
 for the purposes of ``load``/``store`` ``invariant.group`` metadata.
 
 Constrained Floating Point Intrinsics
 -------------------------------------
@@ -12806,7 +12806,7 @@
 Any FP exception that would have been raised by the original code must be raised
 by the transformed code, and the transformed code must not raise any FP
 exceptions that would not have been raised by the original code. This is the
-exception behavior argument that will be used if the code being compiled reads 
+exception behavior argument that will be used if the code being compiled reads
 the FP exception status flags, but this mode can also be used with code that
 unmasks FP exceptions.
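The constrained intrinsics documented in the hunks that follow all take these two
metadata arguments last. As a minimal sketch of the convention (the function name
@fadd_strict is illustrative, not part of this patch), a call that requests the
dynamic rounding mode and strict exception semantics looks like::

      define double @fadd_strict(double %a, double %b) {
      entry:
        ; The optimizer must assume the rounding mode is unknown and that the
        ; FP exception status flags may be inspected after this call.
        %sum = call double @llvm.experimental.constrained.fadd.f64(
                                              double %a, double %b,
                                              metadata !"round.dynamic",
                                              metadata !"fpexcept.strict")
        ret double %sum
      }

      declare double @llvm.experimental.constrained.fadd.f64(double, double,
                                                             metadata, metadata)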
@@ -12824,7 +12824,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.fadd(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -12861,7 +12861,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.fsub(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -12898,7 +12898,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.fmul(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -12935,7 +12935,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.fdiv(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -12972,7 +12972,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.frem(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -13001,8 +13001,44 @@
 
 The value produced is the floating point remainder from the division of
 the two value operands and has the same type as the operands.  The
 remainder has the
-same sign as the dividend. 
+same sign as the dividend.
+
+'``llvm.experimental.constrained.fma``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.fma(<type> <op1>, <type> <op2>, <type> <op3>,
+                                         metadata <rounding mode>,
+                                         metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.fma``' intrinsic returns the result of
+a fused multiply-add on its operands.
+
+Arguments:
+""""""""""
+
+The first three arguments to the '``llvm.experimental.constrained.fma``'
+intrinsic must be :ref:`floating point <t_floating>` or :ref:`vector
+<t_vector>` of floating point values. All arguments must have identical types.
+
+The fourth and fifth arguments specify the rounding mode and exception
+behavior as described above. Note that rounding is performed only once,
+after the multiplication and the addition.
+
+Semantics:
+""""""""""
+
+The value produced is the product of the first two operands added to the third
+operand, with a single rounding; it has the same type as the operands.
 
 Constrained libm-equivalent Intrinsics
 --------------------------------------
@@ -13026,7 +13062,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.sqrt(<type> <op1>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -13063,7 +13099,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.pow(<type> <op1>, <type> <op2>,
                                          metadata <rounding mode>,
                                          metadata <exception behavior>)
@@ -13100,7 +13136,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.powi(<type> <op1>, i32 <op2>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -13139,7 +13175,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.sin(<type> <op1>,
                                          metadata <rounding mode>,
                                          metadata <exception behavior>)
@@ -13175,7 +13211,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.cos(<type> <op1>,
                                          metadata <rounding mode>,
                                          metadata <exception behavior>)
@@ -13211,7 +13247,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.exp(<type> <op1>,
                                          metadata <rounding mode>,
                                          metadata <exception behavior>)
@@ -13246,7 +13282,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.exp2(<type> <op1>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -13282,7 +13318,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.log(<type> <op1>,
                                          metadata <rounding mode>,
                                          metadata <exception behavior>)
@@ -13318,7 +13354,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.log10(<type> <op1>,
                                            metadata <rounding mode>,
                                            metadata <exception behavior>)
@@ -13353,7 +13389,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.log2(<type> <op1>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -13388,7 +13424,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.rint(<type> <op1>,
                                           metadata <rounding mode>,
                                           metadata <exception behavior>)
@@ -13427,7 +13463,7 @@
 
 ::
 
-      declare <type> 
+      declare <type>
       @llvm.experimental.constrained.nearbyint(<type> <op1>,
                                                metadata <rounding mode>,
                                                metadata <exception behavior>)
@@ -14167,7 +14203,7 @@
 memory from the source location to the destination location. These locations are not
 allowed to overlap. The memory copy is performed as a sequence of load/store operations
 where each access is guaranteed to be a multiple of ``element_size`` bytes wide and
-aligned at an ``element_size`` boundary. 
+aligned at an ``element_size`` boundary.
 
 The order of the copy is unspecified.
The same value may be read from the source buffer many times, but only one write is issued to the destination buffer per @@ -14242,7 +14278,7 @@ of memory from the source location to the destination location. These locations are allowed to overlap. The memory copy is performed as a sequence of load/store operations where each access is guaranteed to be a multiple of ``element_size`` -bytes wide and aligned at an ``element_size`` boundary. +bytes wide and aligned at an ``element_size`` boundary. The order of the copy is unspecified. The same value may be read from the source buffer many times, but only one write is issued to the destination buffer per @@ -14317,7 +14353,7 @@ The '``llvm.memset.element.unordered.atomic.*``' intrinsic sets the ``len`` bytes of memory starting at the destination location to the given ``value``. The memory is set with a sequence of store operations where each access is guaranteed to be a -multiple of ``element_size`` bytes wide and aligned at an ``element_size`` boundary. +multiple of ``element_size`` bytes wide and aligned at an ``element_size`` boundary. The order of the assignment is unspecified. Only one write is issued to the destination buffer per element. It is well defined to have concurrent reads and Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -263,7 +263,7 @@ /// They are used to limit optimizations while the DAG is being /// optimized. STRICT_FADD, STRICT_FSUB, STRICT_FMUL, STRICT_FDIV, STRICT_FREM, - + STRICT_FMA, /// Constrained versions of libm-equivalent floating point intrinsics. /// These will be lowered to the equivalent non-constrained pseudo-op /// (or expanded to the equivalent library call) before final selection. Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -623,13 +623,14 @@ /// Test if this node is a strict floating point pseudo-op. 
bool isStrictFPOpcode() { switch (NodeType) { - default: + default: return false; case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: case ISD::STRICT_FDIV: case ISD::STRICT_FREM: + case ISD::STRICT_FMA: case ISD::STRICT_FSQRT: case ISD::STRICT_FPOW: case ISD::STRICT_FPOWI: Index: include/llvm/IR/IntrinsicInst.h =================================================================== --- include/llvm/IR/IntrinsicInst.h +++ include/llvm/IR/IntrinsicInst.h @@ -167,6 +167,7 @@ }; bool isUnaryOp() const; + bool isTernaryOp() const; RoundingMode getRoundingMode() const; ExceptionBehavior getExceptionBehavior() const; @@ -178,6 +179,7 @@ case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_fma: case Intrinsic::experimental_constrained_sqrt: case Intrinsic::experimental_constrained_pow: case Intrinsic::experimental_constrained_powi: Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -490,6 +490,13 @@ llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_fma : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + // These intrinsics are sensitive to the rounding mode so we need constrained // versions of each of them. When strict rounding and exception control are // not required the non-constrained versions of these intrinsics should be Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -907,6 +907,7 @@ case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break; case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break; case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break; + case ISD::STRICT_FMA: EqOpc = ISD::FMA; break; case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break; case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break; case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break; @@ -1072,6 +1073,7 @@ } break; case ISD::STRICT_FSQRT: + case ISD::STRICT_FMA: case ISD::STRICT_FPOW: case ISD::STRICT_FPOWI: case ISD::STRICT_FSIN: @@ -1240,7 +1242,7 @@ // If the index is dependent on the store we will introduce a cycle when // creating the load (the load uses the index, and by replacing the chain // we will make the index dependent on the load). Also, the store might be - // dependent on the extractelement and introduce a cycle when creating + // dependent on the extractelement and introduce a cycle when creating // the load. 
 if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) ||
     ST->hasPredecessor(Op.getNode()))
@@ -4060,6 +4062,11 @@
     Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64,
                                       RTLIB::FMA_F80, RTLIB::FMA_F128,
                                       RTLIB::FMA_PPCF128));
+    break;
+  case ISD::STRICT_FMA:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64,
+                                      RTLIB::FMA_F80, RTLIB::FMA_F128,
+                                      RTLIB::FMA_PPCF128));
     break;
   case ISD::FADD:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64,
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6640,6 +6640,7 @@
   unsigned OrigOpc = Node->getOpcode();
   unsigned NewOpc;
   bool IsUnary = false;
+  bool IsTernary = false;
   switch (OrigOpc) {
   default:
     llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
@@ -6648,6 +6649,7 @@
   case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break;
   case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break;
   case ISD::STRICT_FREM: NewOpc = ISD::FREM; break;
+  case ISD::STRICT_FMA: NewOpc = ISD::FMA; IsTernary = true; break;
   case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break;
   case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break;
   case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break;
@@ -6674,6 +6676,10 @@
   SDNode *Res = nullptr;
   if (IsUnary)
     Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) });
+  else if (IsTernary)
+    Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1),
+                                           Node->getOperand(2),
+                                           Node->getOperand(3) });
   else
     Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1),
                                            Node->getOperand(2) });
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5432,6 +5432,7 @@
   case Intrinsic::experimental_constrained_fmul:
   case Intrinsic::experimental_constrained_fdiv:
   case Intrinsic::experimental_constrained_frem:
+  case Intrinsic::experimental_constrained_fma:
   case Intrinsic::experimental_constrained_sqrt:
   case Intrinsic::experimental_constrained_pow:
   case Intrinsic::experimental_constrained_powi:
@@ -5963,6 +5964,9 @@
   case Intrinsic::experimental_constrained_frem:
     Opcode = ISD::STRICT_FREM;
     break;
+  case Intrinsic::experimental_constrained_fma:
+    Opcode = ISD::STRICT_FMA;
+    break;
   case Intrinsic::experimental_constrained_sqrt:
     Opcode = ISD::STRICT_FSQRT;
     break;
@@ -6009,10 +6013,15 @@
   SDVTList VTs = DAG.getVTList(ValueVTs);
   SDValue Result;
   if (FPI.isUnaryOp())
-    Result = DAG.getNode(Opcode, sdl, VTs, 
+    Result = DAG.getNode(Opcode, sdl, VTs,
                          { Chain, getValue(FPI.getArgOperand(0)) });
+  else if (FPI.isTernaryOp())
+    Result = DAG.getNode(Opcode, sdl, VTs,
+                         { Chain, getValue(FPI.getArgOperand(0)),
+                           getValue(FPI.getArgOperand(1)),
+                           getValue(FPI.getArgOperand(2)) });
   else
-    Result = DAG.getNode(Opcode, sdl, VTs, 
+    Result = DAG.getNode(Opcode, sdl, VTs,
                          { Chain, getValue(FPI.getArgOperand(0)),
                            getValue(FPI.getArgOperand(1)) });
Index: lib/IR/IntrinsicInst.cpp
===================================================================
--- lib/IR/IntrinsicInst.cpp
+++ lib/IR/IntrinsicInst.cpp
@@ -14,10 +14,10 @@
 // are all subclasses of the CallInst class. Note that none of these classes
 // has state or virtual methods, which is an important part of this gross/neat
 // hack working.
-// 
+//
 // In some cases, arguments to intrinsics need to be generic and are defined as
 // type pointer to empty struct { }*. To access the real item of interest the
-// cast instruction needs to be stripped away. 
+// cast instruction needs to be stripped away.
 //
 //===----------------------------------------------------------------------===//
@@ -98,7 +98,7 @@
 ConstrainedFPIntrinsic::RoundingMode
 ConstrainedFPIntrinsic::getRoundingMode() const {
   unsigned NumOperands = getNumArgOperands();
-  Metadata *MD = 
+  Metadata *MD =
       dyn_cast<MetadataAsValue>(getArgOperand(NumOperands - 2))->getMetadata();
   if (!MD || !isa<MDString>(MD))
     return rmInvalid;
@@ -118,7 +118,7 @@
 ConstrainedFPIntrinsic::ExceptionBehavior
 ConstrainedFPIntrinsic::getExceptionBehavior() const {
   unsigned NumOperands = getNumArgOperands();
-  Metadata *MD = 
+  Metadata *MD =
       dyn_cast<MetadataAsValue>(getArgOperand(NumOperands - 1))->getMetadata();
   if (!MD || !isa<MDString>(MD))
     return ebInvalid;
@@ -132,7 +132,7 @@
 
 bool ConstrainedFPIntrinsic::isUnaryOp() const {
   switch (getIntrinsicID()) {
-    default: 
+    default:
       return false;
     case Intrinsic::experimental_constrained_sqrt:
     case Intrinsic::experimental_constrained_sin:
@@ -147,3 +147,12 @@
     return true;
   }
 }
+
+bool ConstrainedFPIntrinsic::isTernaryOp() const {
+  switch (getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::experimental_constrained_fma:
+      return true;
+  }
+}
Index: lib/IR/Verifier.cpp
===================================================================
--- lib/IR/Verifier.cpp
+++ lib/IR/Verifier.cpp
@@ -3969,6 +3969,7 @@
   case Intrinsic::experimental_constrained_fmul:
   case Intrinsic::experimental_constrained_fdiv:
   case Intrinsic::experimental_constrained_frem:
+  case Intrinsic::experimental_constrained_fma:
   case Intrinsic::experimental_constrained_sqrt:
   case Intrinsic::experimental_constrained_pow:
   case Intrinsic::experimental_constrained_powi:
@@ -4429,8 +4430,9 @@
 
 void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
   unsigned NumOperands = FPI.getNumArgOperands();
-  Assert(((NumOperands == 3 && FPI.isUnaryOp()) || (NumOperands == 4)),
-         "invalid arguments for constrained FP intrinsic", &FPI);
+  Assert(((NumOperands == 5 && FPI.isTernaryOp()) ||
+          (NumOperands == 3 && FPI.isUnaryOp()) || (NumOperands == 4)),
+         "invalid arguments for constrained FP intrinsic", &FPI);
   Assert(isa<MetadataAsValue>(FPI.getArgOperand(NumOperands-1)),
          "invalid exception behavior argument", &FPI);
   Assert(isa<MetadataAsValue>(FPI.getArgOperand(NumOperands-2)),
Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2012,6 +2012,16 @@
   switch (Opcode) {
   default: break;
+  case ISD::FMA: {
+    // Convert the generic FMA node to the X86-specific FMADD node and
+    // reselect it, so the existing FMA instruction patterns can match.
+    SDValue FMADD = CurDAG->getNode(X86ISD::FMADD, SDLoc(Node),
+                                    Node->getValueType(0), Node->getOperand(0),
+                                    Node->getOperand(1), Node->getOperand(2));
+    ReplaceUses(SDValue(Node, 0), FMADD);
+    SelectCode(FMADD.getNode());
+    return;
+  }
   case ISD::BRIND: {
     if (Subtarget->isTargetNaCl())
       // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
Index: test/CodeGen/X86/fp-intrinsics.ll
===================================================================
--- test/CodeGen/X86/fp-intrinsics.ll
+++ test/CodeGen/X86/fp-intrinsics.ll
@@ -1,4 +1,5 @@
-; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck --check-prefix=CHECK --check-prefix=COMMON --check-prefix=NO-FMA --check-prefix=FMACALL64 --check-prefix=FMACALL32 %s
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck --check-prefix=CHECK --check-prefix=COMMON --check-prefix=HAS-FMA --check-prefix=FMA64 --check-prefix=FMA32 %s
 
 ; Verify that constants aren't folded to inexact results when the rounding mode
 ; is unknown.
@@ -9,7 +10,7 @@
 ; }
 ;
 ; CHECK-LABEL: f1
-; CHECK: divsd
+; COMMON: divsd
 define double @f1() {
 entry:
   %div = call double @llvm.experimental.constrained.fdiv.f64(
@@ -29,7 +30,7 @@
 ; }
 ;
 ; CHECK-LABEL: f2
-; CHECK: subsd
+; COMMON: subsd
 define double @f2(double %a) {
 entry:
   %div = call double @llvm.experimental.constrained.fsub.f64(
@@ -50,9 +51,9 @@
 ; }
 ;
 ; CHECK-LABEL: f3:
-; CHECK: subsd
-; CHECK: mulsd
-; CHECK: subsd
+; COMMON: subsd
+; COMMON: mulsd
+; COMMON: subsd
 define double @f3(double %a, double %b) {
 entry:
   %sub = call double @llvm.experimental.constrained.fsub.f64(
@@ -81,11 +82,11 @@
 ;   return a;
 ; }
 ;
-; 
+;
 ; CHECK-LABEL: f4:
-; CHECK: testl
-; CHECK: jle
-; CHECK: addsd
+; COMMON: testl
+; COMMON: jle
+; COMMON: addsd
 define double @f4(i32 %n, double %a) {
 entry:
   %cmp = icmp sgt i32 %n, 0
@@ -105,7 +106,7 @@
 
 ; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f5
-; CHECK: sqrtsd
+; COMMON: sqrtsd
 define double @f5() {
 entry:
   %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0,
@@ -116,7 +117,7 @@
 
 ; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f6
-; CHECK: pow
+; COMMON: pow
 define double @f6() {
 entry:
   %result = call double @llvm.experimental.constrained.pow.f64(double 42.1,
@@ -128,7 +129,7 @@
 
 ; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f7
-; CHECK: powi
+; COMMON: powi
 define double @f7() {
 entry:
   %result = call double @llvm.experimental.constrained.powi.f64(double 42.1,
@@ -140,7 +141,7 @@
 
 ; Verify that sin(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f8
-; CHECK: sin
+; COMMON: sin
 define double @f8() {
 entry:
   %result = call double @llvm.experimental.constrained.sin.f64(double 42.0,
@@ -151,7 +152,7 @@
 
 ; Verify that cos(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f9
-; CHECK: cos
+; COMMON: cos
 define double @f9() {
 entry:
   %result = call double @llvm.experimental.constrained.cos.f64(double 42.0,
@@ -162,7 +163,7 @@
 
 ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f10
-; CHECK: exp
+; COMMON: exp
 define double @f10() {
 entry:
   %result = call double @llvm.experimental.constrained.exp.f64(double 42.0,
@@ -173,7 +174,7 @@
 
 ; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f11
-; CHECK: exp2
+; COMMON: exp2
 define double @f11() {
 entry:
   %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1,
@@ -184,7 +185,7 @@
 
 ; Verify that log(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f12
-; CHECK: log
+; COMMON: log
 define double @f12() {
 entry:
   %result = call double @llvm.experimental.constrained.log.f64(double 42.0,
@@ -195,7 +196,7 @@
 
 ; Verify that log10(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f13
-; CHECK: log10
+; COMMON: log10
 define double @f13() {
 entry:
   %result = call double @llvm.experimental.constrained.log10.f64(double 42.0,
@@ -206,7 +207,7 @@
 
 ; Verify that log2(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f14
-; CHECK: log2
+; COMMON: log2
 define double @f14() {
 entry:
   %result = call double @llvm.experimental.constrained.log2.f64(double 42.0,
@@ -217,7 +218,8 @@
 
 ; Verify that rint(42.1) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f15
-; CHECK: rint
+; NO-FMA: rint
+; HAS-FMA: vroundsd
 define double @f15() {
 entry:
   %result = call double @llvm.experimental.constrained.rint.f64(double 42.1,
@@ -229,7 +231,8 @@
 
 ; Verify that nearbyint(42.1) isn't simplified when the rounding mode is
 ; unknown.
 ; CHECK-LABEL: f16
-; CHECK: nearbyint
+; NO-FMA: nearbyint
+; HAS-FMA: vroundsd
 define double @f16() {
 entry:
   %result = call double @llvm.experimental.constrained.nearbyint.f64(
@@ -239,6 +242,38 @@
   ret double %result
 }
 
+; Verify that fma(1.0, 2.0, 3.0) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f17
+; FMACALL32: jmp fmaf # TAILCALL
+; FMA32: vfmadd132ss
+define float @f17() {
+entry:
+  %result = call float @llvm.experimental.constrained.fma.f32(
+                                               float 1.000000e+00,
+                                               float 2.000000e+00,
+                                               float 3.000000e+00,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret float %result
+}
+
+; Verify that fma(42.1, 42.1, 42.1) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f18
+; FMACALL64: jmp fma # TAILCALL
+; FMA64: vfmadd213sd
+define double @f18() {
+entry:
+  %result = call double @llvm.experimental.constrained.fma.f64(
+                                               double 42.1,
+                                               double 42.1,
+                                               double 42.1,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
 @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata"
 declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
@@ -256,3 +291,5 @@
 declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata)
+declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
Index: test/Feature/fp-intrinsics.ll
===================================================================
--- test/Feature/fp-intrinsics.ll
+++ test/Feature/fp-intrinsics.ll
@@ -73,7 +73,7 @@
 ;   return a;
 ; }
 ;
-; 
+;
 ; CHECK-LABEL: @f4
 ; CHECK-NOT: select
 ; CHECK: br i1 %cmp
@@ -94,7 +94,6 @@
   ret double %a.0
 }
 
-
 ; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f5
 ; CHECK: call double @llvm.experimental.constrained.sqrt
@@ -231,6 +230,18 @@
   ret double %result
 }
 
+; Verify that fma(42.1, 42.1, 42.1) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f17 +; CHECK: call double @llvm.experimental.constrained.fma +define double @f17() { +entry: + %result = call double @llvm.experimental.constrained.fma.f64(double 42.1, double 42.1, double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) @@ -248,3 +259,4 @@ declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
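For reference, a minimal end-to-end use of the intrinsic added by this patch,
matching the syntax documented in LangRef.rst above (the function name
@fma_strict is illustrative, not part of the patch)::

      define double @fma_strict(double %x, double %y, double %z) {
      entry:
        ; Computes x * y + z with a single rounding; under "round.dynamic" and
        ; "fpexcept.strict" the optimizer may neither constant-fold this call
        ; nor split it into separate fmul/fadd operations.
        %r = call double @llvm.experimental.constrained.fma.f64(
                                              double %x, double %y, double %z,
                                              metadata !"round.dynamic",
                                              metadata !"fpexcept.strict")
        ret double %r
      }

      declare double @llvm.experimental.constrained.fma.f64(double, double, double,
                                                            metadata, metadata)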