Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -1486,6 +1486,9 @@
   Opts.CUDA = IK == IK_CUDA || IK == IK_PreprocessedCuda ||
               LangStd == LangStandard::lang_cuda;
 
+  if (Opts.CUDA)
+    Opts.DefaultFPContract = 1;
+
   // OpenCL and C++ both have bool, true, false keywords.
   Opts.Bool = Opts.OpenCL || Opts.CPlusPlus;
 
Index: test/CodeGenCUDA/fp-contract.cu
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/fp-contract.cu
@@ -0,0 +1,34 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// By default we should fuse multiply/add into llvm.fmuladd intrinsic
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -emit-llvm \
+// RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s
+
+// Explicit -ffp-contract=off should disable instruction fusing.
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -emit-llvm \
+// RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix DISABLED %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix DISABLED %s
+
+// Explicit -ffp-contract=fast lets LLVM do the fusing, so no fusing in clang.
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -emit-llvm \
+// RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix DISABLED %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix DISABLED %s
+
+#include "Inputs/cuda.h"
+
+__host__ __device__ float func(float a, float b, float c) { return a + b * c; }
+// ENABLED:       call float @llvm.fmuladd.f32
+// ENABLED-NEXT:  ret
+
+// DISABLED:      fmul float
+// DISABLED-NEXT: fadd float
+// DISABLED-NEXT: ret