diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3114,6 +3114,9 @@
 
 void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
   SDNodeFlags Flags;
+  if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
+    Flags.copyFMF(*FPOp);
+  }
   if (auto *OFBinOp = dyn_cast<OverflowingBinaryOperator>(&I)) {
     Flags.setNoSignedWrap(OFBinOp->hasNoSignedWrap());
     Flags.setNoUnsignedWrap(OFBinOp->hasNoUnsignedWrap());
diff --git a/llvm/test/CodeGen/X86/fused-fma.ll b/llvm/test/CodeGen/X86/fused-fma.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fused-fma.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=x86-64 -mcpu=haswell | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13"
+target triple = "x86_64--linux-gnu"
+
+%jl_value_t = type opaque
+
+; This function should fuse the `fmul contract` and the `fadd fast` in %vector.body into a single vfmadd (FMA) instruction.
+; CHECK-LABEL: julia_dotf
+define double @julia_dotf(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40) %0) {
+; CHECK-LABEL: julia_dotf:
+; CHECK:       # %bb.0: # %L13.lr.ph
+; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovupd (%rax), %ymm1
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm1) + ymm0
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %middle.block
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+L13.lr.ph:
+  br label %vector.ph
+
+vector.ph:                                        ; preds = %L13.lr.ph
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %vec.phi10 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ] ; accumulator: zero on entry, %2 on the back edge
+  %wide.load13 = load <4 x double>, <4 x double> addrspace(13)* undef, align 8
+  %wide.load17 = load <4 x double>, <4 x double> addrspace(13)* undef, align 8
+  %1 = fmul contract <4 x double> %wide.load13, %wide.load17 ; the multiply carries only the `contract` flag
+  %2 = fadd fast <4 x double> %vec.phi10, %1 ; the add carries `fast`; expected to absorb %1 as an FMA
+  %3 = icmp eq i64 undef, undef ; exit condition compares undef with undef
+  br i1 %3, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = fadd fast <4 x double> %2, undef
+  %bin.rdx20 = fadd fast <4 x double> undef, %bin.rdx
+  %bin.rdx21 = fadd fast <4 x double> undef, %bin.rdx20
+  %rdx.shuf = shufflevector <4 x double> %bin.rdx21, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx22 = fadd fast <4 x double> %bin.rdx21, %rdx.shuf
+  %rdx.shuf23 = shufflevector <4 x double> %bin.rdx22, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx24 = fadd fast <4 x double> %bin.rdx22, %rdx.shuf23
+  %4 = extractelement <4 x double> %bin.rdx24, i32 0
+  br label %L32
+
+L32:                                              ; preds = %middle.block
+  ret double %4
+}