Index: clang/docs/LanguageExtensions.rst
===================================================================
--- clang/docs/LanguageExtensions.rst
+++ clang/docs/LanguageExtensions.rst
@@ -3209,8 +3209,9 @@
 
 The pragma can also be used with ``off`` which turns FP contraction off for a
 section of the code. This can be useful when fast contraction is otherwise
-enabled for the translation unit with the ``-ffp-contract=fast`` flag.
-
+enabled for the translation unit with the ``-ffp-contract=faststd`` flag.
+Note that ``-ffp-contract=fast`` will override pragmas to fuse multiply and
+addition across statements regardless of any controlling pragmas.
 
 ``#pragma clang fp exceptions`` specifies floating point exception behavior. It
 may take one the the values: ``ignore``, ``maytrap`` or ``strict``. Meaning of
Index: clang/docs/UsersManual.rst
===================================================================
--- clang/docs/UsersManual.rst
+++ clang/docs/UsersManual.rst
@@ -1291,15 +1291,16 @@
    The C standard permits intermediate floating-point results within an
    expression to be computed with more precision than their type would
    normally allow. This permits operation fusing, and Clang takes advantage
-   of this by default. This behavior can be controlled with the
-   ``FP_CONTRACT`` pragma. Please refer to the pragma documentation for a
-   description of how the pragma interacts with this option.
+   of this by default. This behavior can be controlled with the ``FP_CONTRACT``
+   and ``clang fp contract`` pragmas. Please refer to the pragma documentation
+   for a description of how the pragmas interact with this option.
 
    Valid values are:
 
-   * ``fast`` (everywhere)
-   * ``on`` (according to FP_CONTRACT pragma, default)
+   * ``fast`` (fuse across statements disregarding pragmas, default for CUDA)
+   * ``on`` (fuse in the same statement unless dictated by pragmas, default for languages other than CUDA/HIP)
    * ``off`` (never fuse)
+   * ``faststd`` (fuse across statements unless dictated by pragmas, default for HIP)
 
 .. _opt_fhonor-infinities:
 
Index: clang/include/clang/Basic/LangOptions.h
===================================================================
--- clang/include/clang/Basic/LangOptions.h
+++ clang/include/clang/Basic/LangOptions.h
@@ -187,8 +187,11 @@
     // Enable the floating point pragma
     FPM_On,
 
-    // Aggressively fuse FP ops (E.g. FMA).
-    FPM_Fast
+    // Aggressively fuse FP ops (E.g. FMA) disregarding pragmas.
+    FPM_Fast,
+
+    // Aggressively fuse FP ops and respect pragmas.
+    FPM_FastStd
   };
 
   /// Alias for RoundingMode::NearestTiesToEven.
@@ -410,7 +413,13 @@
   }
   explicit FPOptions(const LangOptions &LO) {
     Value = 0;
-    setFPContractMode(LO.getDefaultFPContractMode());
+    // The language fp contract option FPM_FastStd has the same effect as
+    // FPM_Fast in frontend. For simplicity, use FPM_Fast uniformly in
+    // frontend.
+    auto LangOptContractMode = LO.getDefaultFPContractMode();
+    if (LangOptContractMode == LangOptions::FPM_FastStd)
+      LangOptContractMode = LangOptions::FPM_Fast;
+    setFPContractMode(LangOptContractMode);
     setRoundingMode(LO.getFPRoundingMode());
     setFPExceptionMode(LO.getFPExceptionMode());
     setAllowFPReassociate(LO.AllowFPReassoc);
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -1237,9 +1237,13 @@
 def ftrapping_math : Flag<["-"], "ftrapping-math">, Group<f_Group>, Flags<[CC1Option]>;
 def fno_trapping_math : Flag<["-"], "fno-trapping-math">, Group<f_Group>, Flags<[CC1Option]>;
 def ffp_contract : Joined<["-"], "ffp-contract=">, Group<f_Group>,
-  Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)"
-  " | on (according to FP_CONTRACT pragma) | off (never fuse). Default"
-  " is 'fast' for CUDA/HIP and 'on' otherwise.">, Values<"fast,on,off">;
+  Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs):"
+  " fast (fuses across statements disregarding pragmas)"
+  " | on (only fuses in the same statement unless dictated by pragmas)"
+  " | off (never fuses)"
+  " | faststd (fuses across statements unless diectated by pragmas)."
+  " Default is 'fast' for CUDA, 'faststd' for HIP, and 'on' otherwise.">,
+  Values<"fast,on,off,faststd">;
 
 defm strict_float_cast_overflow : OptOutFFlag<"strict-float-cast-overflow",
   "Assume that overflowing float-to-int casts are undefined (default)",
Index: clang/lib/CodeGen/BackendUtil.cpp
===================================================================
--- clang/lib/CodeGen/BackendUtil.cpp
+++ clang/lib/CodeGen/BackendUtil.cpp
@@ -476,6 +476,7 @@
     Options.AllowFPOpFusion = llvm::FPOpFusion::Standard;
     break;
   case LangOptions::FPM_On:
+  case LangOptions::FPM_FastStd:
     Options.AllowFPOpFusion = llvm::FPOpFusion::Standard;
     break;
   case LangOptions::FPM_Fast:
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -2416,9 +2416,20 @@
 
   Opts.HIP = IK.getLanguage() == Language::HIP;
   Opts.CUDA = IK.getLanguage() == Language::CUDA || Opts.HIP;
-  if (Opts.CUDA)
-    // Set default FP_CONTRACT to FAST.
+  if (Opts.HIP) {
+    // HIP toolchain does not support 'Fast' FPOpFusion in backends since it
+    // fuses multiplication/addition instructions without contract flag from
+    // device library functions in LLVM bitcode, which causes accuracy loss in
+    // certain math functions, e.g. tan(-1e20) becomes -0.933 instead of 0.8446.
+    // For device library functions in bitcode to work, 'Strict' or 'Standard'
+    // FPOpFusion options in backends is needed. Therefore 'faststd' FP contract
+    // option is used to allow fuse across statements in frontend whereas
+    // respecting contract flag in backend.
+    Opts.setDefaultFPContractMode(LangOptions::FPM_FastStd);
+  } else if (Opts.CUDA) {
+    // Allow fuse across statements disregarding pragmas.
     Opts.setDefaultFPContractMode(LangOptions::FPM_Fast);
+  }
 
   Opts.RenderScript = IK.getLanguage() == Language::RenderScript;
   if (Opts.RenderScript) {
@@ -3379,6 +3390,8 @@
       Opts.setDefaultFPContractMode(LangOptions::FPM_On);
     else if (Val == "off")
       Opts.setDefaultFPContractMode(LangOptions::FPM_Off);
+    else if (Val == "faststd")
+      Opts.setDefaultFPContractMode(LangOptions::FPM_FastStd);
     else
       Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val;
   }
Index: clang/lib/Sema/SemaAttr.cpp
===================================================================
--- clang/lib/Sema/SemaAttr.cpp
+++ clang/lib/Sema/SemaAttr.cpp
@@ -966,6 +966,8 @@
   case LangOptions::FPM_Off:
     NewFPFeatures.setDisallowFPContract();
     break;
+  case LangOptions::FPM_FastStd:
+    llvm_unreachable("Should not happen");
   }
   FpPragmaStack.Act(Loc, Sema::PSK_Set, StringRef(), NewFPFeatures);
   CurFPFeatures = NewFPFeatures.applyOverrides(getLangOpts());
Index: clang/test/CodeGenCUDA/fp-contract.cu
===================================================================
--- clang/test/CodeGenCUDA/fp-contract.cu
+++ clang/test/CodeGenCUDA/fp-contract.cu
@@ -1,32 +1,298 @@
-// REQUIRES: x86-registered-target
-// REQUIRES: nvptx-registered-target
+// REQUIRES: x86-registered-target, nvptx-registered-target, amdgpu-registered-target
 
-// By default we should fuse multiply/add into fma instruction.
+// By default CUDA uses -ffp-contract=fast, HIP uses -ffp-contract=faststd.
+// we should fuse multiply/add into fma instruction.
+// In IR, fmul/fadd instructions with contract flag are emitted.
+// In backend
+//    nvptx -  assumes fast fp fuse option, which fuses
+//             mult/add insts disregarding contract flag and
+//             llvm.fmuladd intrinsics.
+//    amdgcn - assumes standard fp fuse option, which only
+//             fuses mult/add insts with contract flag and
+//             llvm.fmuladd intrinsics.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
-// RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s
+// RUN:   -O3 -o - %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
 
 // Explicit -ffp-contract=fast
+// In IR, fmul/fadd instructions with contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes fast fp fuse option, which fuses
+//                   mult/add insts disregarding contract flag and
+//                   llvm.fmuladd intrinsics.
+
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=fast -disable-llvm-passes -o - %s \
-// RUN:   | FileCheck -check-prefix ENABLED %s
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=fast \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=fast \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=fast \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+// When input is IR, -ffp-contract has no effect. Backend uses default
+// default FP fuse option.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=fast \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
+
+// Explicit -ffp-contract=faststd
+// In IR, fmul/fadd instructions with contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes standard fp fuse option, which only
+//                   fuses mult/add insts with contract flag or
+//                   llvm.fmuladd intrinsics.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -ffp-contract=faststd -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=faststd \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=faststd \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FASTSTD %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=faststd \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+// When input is IR, -ffp-contract has no effect. Backend uses default
+// default FP fuse option.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=faststd \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
 
 // Explicit -ffp-contract=on -- fusing by front-end.
+// In IR,
+//    mult/add in the same statement - llvm.fmuladd instrinsic emitted
+//    mult/add in different statement -  fmul/fadd instructions without
+//                                       contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes standard fp fuse option, which only
+//                   fuses mult/add insts with contract flag or
+//                   llvm.fmuladd intrinsics.
+
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=on -disable-llvm-passes -o - %s \
-// RUN:   | FileCheck -check-prefix ENABLED %s
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=on \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=on \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=on \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=on \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
 
 // Explicit -ffp-contract=off should disable instruction fusing.
+// In IR, fmul/fadd instructions without contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes standard fp fuse option, which only
+//                   fuses mult/add insts with contract flag or
+//                   llvm.fmuladd intrinsics.
+
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
-// RUN:   | FileCheck -check-prefix DISABLED %s
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OFF %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=off \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OFF %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=off \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-OFF %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=off \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
 
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=off \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
 
 #include "Inputs/cuda.h"
 
+// Test multiply/add in the same statement, which can be emitted as FMA when
+// fp-contract is on or fast.
 __host__ __device__ float func(float a, float b, float c) { return a + b * c; }
-// ENABLED:       fma.rn.f32
-// ENABLED-NEXT:  st.param.f32
+// COMMON-LABEL: _Z4funcfff
+// NV-ON:       fma.rn.f32
+// NV-ON-NEXT:  st.param.f32
+// AMD-ON:       v_fmac_f32_e64
+// AMD-ON-NEXT:  s_setpc_b64
+
+// NV-OFF:      mul.rn.f32
+// NV-OFF-NEXT: add.rn.f32
+// NV-OFF-NEXT: st.param.f32
+// AMD-OFF:      v_mul_f32_e64
+// AMD-OFF-NEXT: v_add_f32_e64
+// AMD-OFF-NEXT: s_setpc_b64
+
+// NV-OPT-FAST: fma.rn.f32
+// NV-OPT-FAST-NEXT: st.param.f32
+// NV-OPT-FASTSTD: fma.rn.f32
+// NV-OPT-FASTSTD-NEXT: st.param.f32
+// NV-OPT-ON: fma.rn.f32
+// NV-OPT-ON-NEXT: st.param.f32
+// NV-OPT-OFF: mul.rn.f32
+// NV-OPT-OFF-NEXT: add.rn.f32
+// NV-OPT-OFF-NEXT: st.param.f32
+
+// AMD-OPT-FAST-IR: fmul contract float
+// AMD-OPT-FAST-IR: fadd contract float
+// AMD-OPT-ON-IR: @llvm.fmuladd.f32
+// AMD-OPT-OFF-IR: fmul float
+// AMD-OPT-OFF-IR: fadd float
+
+// AMD-OPT-FAST: v_fmac_f32_e32
+// AMD-OPT-FAST-NEXT: s_setpc_b64
+// AMD-OPT-FASTSTD: v_fmac_f32_e32
+// AMD-OPT-FASTSTD-NEXT: s_setpc_b64
+// AMD-OPT-ON: v_fmac_f32_e32
+// AMD-OPT-ON-NEXT: s_setpc_b64
+// AMD-OPT-OFF: v_mul_f32_e32
+// AMD-OPT-OFF-NEXT: v_add_f32_e32
+// AMD-OPT-OFF-NEXT: s_setpc_b64
+
+// Test multiply/add in the different statements, which can be emitted as
+// FMA when fp-contract is fast but not on.
+__host__ __device__ float func2(float a, float b, float c) {
+  float t = b * c;
+  return t + a;
+}
+// COMMON-LABEL: _Z5func2fff
+// NV-OPT-FAST: fma.rn.f32
+// NV-OPT-FAST-NEXT: st.param.f32
+// NV-OPT-FASTSTD: fma.rn.f32
+// NV-OPT-FASTSTD-NEXT: st.param.f32
+// NV-OPT-ON: mul.rn.f32
+// NV-OPT-ON: add.rn.f32
+// NV-OPT-ON-NEXT: st.param.f32
+// NV-OPT-OFF: mul.rn.f32
+// NV-OPT-OFF: add.rn.f32
+// NV-OPT-OFF-NEXT: st.param.f32
+
+// AMD-OPT-FAST-IR: fmul contract float
+// AMD-OPT-FAST-IR: fadd contract float
+// AMD-OPT-ON-IR: fmul float
+// AMD-OPT-ON-IR: fadd float
+// AMD-OPT-OFF-IR: fmul float
+// AMD-OPT-OFF-IR: fadd float
+
+// AMD-OPT-FAST: v_fmac_f32_e32
+// AMD-OPT-FAST-NEXT: s_setpc_b64
+// AMD-OPT-FASTSTD: v_fmac_f32_e32
+// AMD-OPT-FASTSTD-NEXT: s_setpc_b64
+// AMD-OPT-ON: v_mul_f32_e32
+// AMD-OPT-ON-NEXT: v_add_f32_e32
+// AMD-OPT-ON-NEXT: s_setpc_b64
+// AMD-OPT-OFF: v_mul_f32_e32
+// AMD-OPT-OFF-NEXT: v_add_f32_e32
+// AMD-OPT-OFF-NEXT: s_setpc_b64
+
+// Test multiply/add in the different statements, which is forced
+// to be compiled with fp contract on. fmul/fadd without contract
+// flags are emitted in IR. In nvptx, they are emitted as FMA in
+// fp-contract is fast but not on, as nvptx backend uses the same
+// fp fuse option as front end, whereas fast fp fuse option in
+// backend fuses fadd/fmul disregarding contract flag. In amdgcn
+// they are not fused as amdgcn always use standard fp fusion
+// option which respects contract flag.
+  __host__ __device__ float func3(float a, float b, float c) {
+#pragma clang fp contract(on)
+  float t = b * c;
+  return t + a;
+}
+// COMMON-LABEL: _Z5func3fff
+// NV-OPT-FAST: fma.rn.f32
+// NV-OPT-FAST-NEXT: st.param.f32
+// NV-OPT-FASTSTD: mul.rn.f32
+// NV-OPT-FASTSTD: add.rn.f32
+// NV-OPT-FASTSTD-NEXT: st.param.f32
+// NV-OPT-ON: mul.rn.f32
+// NV-OPT-ON: add.rn.f32
+// NV-OPT-ON-NEXT: st.param.f32
+// NV-OPT-OFF: mul.rn.f32
+// NV-OPT-OFF: add.rn.f32
+// NV-OPT-OFF-NEXT: st.param.f32
+
+// AMD-OPT-FAST-IR: fmul float
+// AMD-OPT-FAST-IR: fadd float
+// AMD-OPT-ON-IR: fmul float
+// AMD-OPT-ON-IR: fadd float
+// AMD-OPT-OFF-IR: fmul float
+// AMD-OPT-OFF-IR: fadd float
 
-// DISABLED:      mul.rn.f32
-// DISABLED-NEXT: add.rn.f32
-// DISABLED-NEXT: st.param.f32
+// AMD-OPT-FAST: v_fmac_f32_e32
+// AMD-OPT-FAST-NEXT: s_setpc_b64
+// AMD-OPT-FASTSTD: v_mul_f32_e32
+// AMD-OPT-FASTSTD-NEXT: v_add_f32_e32
+// AMD-OPT-FASTSTD-NEXT: s_setpc_b64
+// AMD-OPT-ON: v_mul_f32_e32
+// AMD-OPT-ON-NEXT: v_add_f32_e32
+// AMD-OPT-ON-NEXT: s_setpc_b64
+// AMD-OPT-OFF: v_mul_f32_e32
+// AMD-OPT-OFF-NEXT: v_add_f32_e32
+// AMD-OPT-OFF-NEXT: s_setpc_b64
Index: clang/test/Driver/autocomplete.c
===================================================================
--- clang/test/Driver/autocomplete.c
+++ clang/test/Driver/autocomplete.c
@@ -66,6 +66,7 @@
 // FNOSANICOVERALL-NEXT: trace-pc-guard
 // RUN: %clang --autocomplete=-ffp-contract= | FileCheck %s -check-prefix=FFPALL
 // FFPALL: fast
+// FFPALL-NEXT: faststd 
 // FFPALL-NEXT: off
 // FFPALL-NEXT: on
 // RUN: %clang --autocomplete=-flto= | FileCheck %s -check-prefix=FLTOALL