Index: clang/docs/LanguageExtensions.rst
===================================================================
--- clang/docs/LanguageExtensions.rst
+++ clang/docs/LanguageExtensions.rst
@@ -3209,7 +3209,9 @@
 
 The pragma can also be used with ``off`` which turns FP contraction off for a
 section of the code. This can be useful when fast contraction is otherwise
-enabled for the translation unit with the ``-ffp-contract=fast`` flag.
+enabled for the translation unit with the ``-ffp-contract=faststd`` flag. It
+should be noted that ``-ffp-contract=fast`` option will always fuse multiply
+and addition across statements disregarding fp contract controlling pragmas.
 
 
 ``#pragma clang fp exceptions`` specifies floating point exception behavior. It
Index: clang/docs/UsersManual.rst
===================================================================
--- clang/docs/UsersManual.rst
+++ clang/docs/UsersManual.rst
@@ -1292,14 +1292,17 @@
    expression to be computed with more precision than their type would
    normally allow. This permits operation fusing, and Clang takes advantage
    of this by default. This behavior can be controlled with the
-   ``FP_CONTRACT`` pragma. Please refer to the pragma documentation for a
-   description of how the pragma interacts with this option.
+   ``FP_CONTRACT`` pragma. Clang also supports ``clang fp contract`` pragma
+   for more flexible controlling of floating-point contractions in all
+   languages.  Please refer to the pragma documentation for a
+   description of how the pragmas interact with this option.
 
    Valid values are:
 
-   * ``fast`` (everywhere)
-   * ``on`` (according to FP_CONTRACT pragma, default)
+   * ``fast`` (fuse across statements disregarding pragmas, default for CUDA)
+   * ``on`` (fuse in the same statement unless dictated by pragmas, default for languages other than CUDA/HIP)
    * ``off`` (never fuse)
+   * ``faststd`` (fuse across statements unless dictated by pragmas, default for HIP)
 
 .. _opt_fhonor-infinities:
 
Index: clang/include/clang/Basic/LangOptions.h
===================================================================
--- clang/include/clang/Basic/LangOptions.h
+++ clang/include/clang/Basic/LangOptions.h
@@ -187,8 +187,11 @@
     // Enable the floating point pragma
     FPM_On,
 
-    // Aggressively fuse FP ops (E.g. FMA).
-    FPM_Fast
+    // Aggressively fuse FP ops (E.g. FMA) disregarding pragmas.
+    FPM_Fast,
+
+    // Aggressively fuse FP ops and respect pragmas.
+    FPM_FastStd
   };
 
   /// Alias for RoundingMode::NearestTiesToEven.
@@ -410,7 +413,13 @@
   }
   explicit FPOptions(const LangOptions &LO) {
     Value = 0;
-    setFPContractMode(LO.getDefaultFPContractMode());
+    // The language fp contract option FPM_FastStd has the same effect as
+    // FPM_Fast in frontend. For simplicity, use FPM_Fast uniformly in
+    // frontend.
+    auto LangOptContractMode = LO.getDefaultFPContractMode();
+    if (LangOptContractMode == LangOptions::FPM_FastStd)
+      LangOptContractMode = LangOptions::FPM_Fast;
+    setFPContractMode(LangOptContractMode);
     setRoundingMode(LO.getFPRoundingMode());
     setFPExceptionMode(LO.getFPExceptionMode());
     setAllowFPReassociate(LO.AllowFPReassoc);
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -1237,9 +1237,13 @@
 def ftrapping_math : Flag<["-"], "ftrapping-math">, Group<f_Group>, Flags<[CC1Option]>;
 def fno_trapping_math : Flag<["-"], "fno-trapping-math">, Group<f_Group>, Flags<[CC1Option]>;
 def ffp_contract : Joined<["-"], "ffp-contract=">, Group<f_Group>,
-  Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)"
-  " | on (according to FP_CONTRACT pragma) | off (never fuse). Default"
-  " is 'fast' for CUDA/HIP and 'on' otherwise.">, Values<"fast,on,off">;
+  Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs):"
+  " fast (fuses across statements disregarding pragmas)"
+  " | on (only fuses in the same statement unless dictated by pragmas)"
+  " | off (never fuses)"
+  " | faststd (fuses across statements unless diectated by pragmas)."
+  " Default is 'fast' for CUDA, 'faststd' for HIP, and 'on' otherwise.">,
+  Values<"fast,on,off,faststd">;
 
 defm strict_float_cast_overflow : OptOutFFlag<"strict-float-cast-overflow",
   "Assume that overflowing float-to-int casts are undefined (default)",
Index: clang/lib/CodeGen/BackendUtil.cpp
===================================================================
--- clang/lib/CodeGen/BackendUtil.cpp
+++ clang/lib/CodeGen/BackendUtil.cpp
@@ -476,6 +476,7 @@
     Options.AllowFPOpFusion = llvm::FPOpFusion::Standard;
     break;
   case LangOptions::FPM_On:
+  case LangOptions::FPM_FastStd:
     Options.AllowFPOpFusion = llvm::FPOpFusion::Standard;
     break;
   case LangOptions::FPM_Fast:
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -2416,9 +2416,20 @@
 
   Opts.HIP = IK.getLanguage() == Language::HIP;
   Opts.CUDA = IK.getLanguage() == Language::CUDA || Opts.HIP;
-  if (Opts.CUDA)
-    // Set default FP_CONTRACT to FAST.
+  if (Opts.HIP) {
+    // HIP toolchain does not support 'Fast' FPOpFusion in backends since it
+    // fuses multiplication/addition instructions without contract flag from
+    // device library functions in LLVM bitcode, which causes accuracy loss in
+    // certain math functions, e.g. tan(-1e20) becomes -0.933 instead of 0.8446.
+    // For device library functions in bitcode to work, 'Strict' or 'Standard'
+    // FPOpFusion options in backends is needed. Therefore 'faststd' FP contract
+    // option is used to allow fuse across statements in frontend whereas
+    // respecting contract flag in backend.
+    Opts.setDefaultFPContractMode(LangOptions::FPM_FastStd);
+  } else if (Opts.CUDA) {
+    // Allow fuse across statements disregarding pragmas.
     Opts.setDefaultFPContractMode(LangOptions::FPM_Fast);
+  }
 
   Opts.RenderScript = IK.getLanguage() == Language::RenderScript;
   if (Opts.RenderScript) {
@@ -3379,6 +3390,8 @@
       Opts.setDefaultFPContractMode(LangOptions::FPM_On);
     else if (Val == "off")
       Opts.setDefaultFPContractMode(LangOptions::FPM_Off);
+    else if (Val == "faststd")
+      Opts.setDefaultFPContractMode(LangOptions::FPM_FastStd);
     else
       Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val;
   }
Index: clang/lib/Sema/SemaAttr.cpp
===================================================================
--- clang/lib/Sema/SemaAttr.cpp
+++ clang/lib/Sema/SemaAttr.cpp
@@ -966,6 +966,8 @@
   case LangOptions::FPM_Off:
     NewFPFeatures.setDisallowFPContract();
     break;
+  case LangOptions::FPM_FastStd:
+    llvm_unreachable("Should not happen");
   }
   FpPragmaStack.Act(Loc, Sema::PSK_Set, StringRef(), NewFPFeatures);
   CurFPFeatures = NewFPFeatures.applyOverrides(getLangOpts());
Index: clang/test/CodeGenCUDA/fp-contract.cu
===================================================================
--- clang/test/CodeGenCUDA/fp-contract.cu
+++ clang/test/CodeGenCUDA/fp-contract.cu
@@ -1,32 +1,298 @@
-// REQUIRES: x86-registered-target
-// REQUIRES: nvptx-registered-target
+// REQUIRES: x86-registered-target, nvptx-registered-target, amdgpu-registered-target
 
-// By default we should fuse multiply/add into fma instruction.
+// By default CUDA uses -ffp-contract=fast, HIP uses -ffp-contract=faststd.
+// we should fuse multiply/add into fma instruction.
+// In IR, fmul/fadd instructions with contract flag are emitted.
+// In backend
+//    nvptx -  assumes fast fp fuse option, which fuses
+//             mult/add insts disregarding contract flag and
+//             llvm.fmuladd intrinsics.
+//    amdgcn - assumes standard fp fuse option, which only
+//             fuses mult/add insts with contract flag and
+//             llvm.fmuladd intrinsics.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
-// RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s
+// RUN:   -O3 -o - %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
 
 // Explicit -ffp-contract=fast
+// In IR, fmul/fadd instructions with contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes fast fp fuse option, which fuses
+//                   mult/add insts disregarding contract flag and
+//                   llvm.fmuladd intrinsics.
+
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=fast -disable-llvm-passes -o - %s \
-// RUN:   | FileCheck -check-prefix ENABLED %s
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=fast \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=fast \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=fast \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+// When input is IR, -ffp-contract has no effect. Backend uses default
+// default FP fuse option.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=fast \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
+
+// Explicit -ffp-contract=faststd
+// In IR, fmul/fadd instructions with contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes standard fp fuse option, which only
+//                   fuses mult/add insts with contract flag or
+//                   llvm.fmuladd intrinsics.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -ffp-contract=faststd -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=faststd \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=faststd \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FASTSTD %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=faststd \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+// When input is IR, -ffp-contract has no effect. Backend uses default
+// default FP fuse option.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=faststd \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
 
 // Explicit -ffp-contract=on -- fusing by front-end.
+// In IR,
+//    mult/add in the same statement - llvm.fmuladd instrinsic emitted
+//    mult/add in different statement -  fmul/fadd instructions without
+//                                       contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes standard fp fuse option, which only
+//                   fuses mult/add insts with contract flag or
+//                   llvm.fmuladd intrinsics.
+
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=on -disable-llvm-passes -o - %s \
-// RUN:   | FileCheck -check-prefix ENABLED %s
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=on \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=on \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=on \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=on \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
 
 // Explicit -ffp-contract=off should disable instruction fusing.
+// In IR, fmul/fadd instructions without contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes standard fp fuse option, which only
+//                   fuses mult/add insts with contract flag or
+//                   llvm.fmuladd intrinsics.
+
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
-// RUN:   | FileCheck -check-prefix DISABLED %s
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OFF %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=off \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OFF %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=off \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-OFF %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=off \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
 
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=off \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
 
 #include "Inputs/cuda.h"
 
+// Test multiply/add in the same statement, which can be emitted as FMA when
+// fp-contract is on or fast.
 __host__ __device__ float func(float a, float b, float c) { return a + b * c; }
-// ENABLED:       fma.rn.f32
-// ENABLED-NEXT:  st.param.f32
+// COMMON-LABEL: _Z4funcfff
+// NV-ON:       fma.rn.f32
+// NV-ON-NEXT:  st.param.f32
+// AMD-ON:       v_fmac_f32_e64
+// AMD-ON-NEXT:  s_setpc_b64
+
+// NV-OFF:      mul.rn.f32
+// NV-OFF-NEXT: add.rn.f32
+// NV-OFF-NEXT: st.param.f32
+// AMD-OFF:      v_mul_f32_e64
+// AMD-OFF-NEXT: v_add_f32_e64
+// AMD-OFF-NEXT: s_setpc_b64
+
+// NV-OPT-FAST: fma.rn.f32
+// NV-OPT-FAST-NEXT: st.param.f32
+// NV-OPT-FASTSTD: fma.rn.f32
+// NV-OPT-FASTSTD-NEXT: st.param.f32
+// NV-OPT-ON: fma.rn.f32
+// NV-OPT-ON-NEXT: st.param.f32
+// NV-OPT-OFF: mul.rn.f32
+// NV-OPT-OFF-NEXT: add.rn.f32
+// NV-OPT-OFF-NEXT: st.param.f32
+
+// AMD-OPT-FAST-IR: fmul contract float
+// AMD-OPT-FAST-IR: fadd contract float
+// AMD-OPT-ON-IR: @llvm.fmuladd.f32
+// AMD-OPT-OFF-IR: fmul float
+// AMD-OPT-OFF-IR: fadd float
+
+// AMD-OPT-FAST: v_fmac_f32_e32
+// AMD-OPT-FAST-NEXT: s_setpc_b64
+// AMD-OPT-FASTSTD: v_fmac_f32_e32
+// AMD-OPT-FASTSTD-NEXT: s_setpc_b64
+// AMD-OPT-ON: v_fmac_f32_e32
+// AMD-OPT-ON-NEXT: s_setpc_b64
+// AMD-OPT-OFF: v_mul_f32_e32
+// AMD-OPT-OFF-NEXT: v_add_f32_e32
+// AMD-OPT-OFF-NEXT: s_setpc_b64
+
+// Test multiply/add in the different statements, which can be emitted as
+// FMA when fp-contract is fast but not on.
+__host__ __device__ float func2(float a, float b, float c) {
+  float t = b * c;
+  return t + a;
+}
+// COMMON-LABEL: _Z5func2fff
+// NV-OPT-FAST: fma.rn.f32
+// NV-OPT-FAST-NEXT: st.param.f32
+// NV-OPT-FASTSTD: fma.rn.f32
+// NV-OPT-FASTSTD-NEXT: st.param.f32
+// NV-OPT-ON: mul.rn.f32
+// NV-OPT-ON: add.rn.f32
+// NV-OPT-ON-NEXT: st.param.f32
+// NV-OPT-OFF: mul.rn.f32
+// NV-OPT-OFF: add.rn.f32
+// NV-OPT-OFF-NEXT: st.param.f32
+
+// AMD-OPT-FAST-IR: fmul contract float
+// AMD-OPT-FAST-IR: fadd contract float
+// AMD-OPT-ON-IR: fmul float
+// AMD-OPT-ON-IR: fadd float
+// AMD-OPT-OFF-IR: fmul float
+// AMD-OPT-OFF-IR: fadd float
+
+// AMD-OPT-FAST: v_fmac_f32_e32
+// AMD-OPT-FAST-NEXT: s_setpc_b64
+// AMD-OPT-FASTSTD: v_fmac_f32_e32
+// AMD-OPT-FASTSTD-NEXT: s_setpc_b64
+// AMD-OPT-ON: v_mul_f32_e32
+// AMD-OPT-ON-NEXT: v_add_f32_e32
+// AMD-OPT-ON-NEXT: s_setpc_b64
+// AMD-OPT-OFF: v_mul_f32_e32
+// AMD-OPT-OFF-NEXT: v_add_f32_e32
+// AMD-OPT-OFF-NEXT: s_setpc_b64
+
+// Test multiply/add in the different statements, which is forced
+// to be compiled with fp contract on. fmul/fadd without contract
+// flags are emitted in IR. In nvptx, they are emitted as FMA in
+// fp-contract is fast but not on, as nvptx backend uses the same
+// fp fuse option as front end, whereas fast fp fuse option in
+// backend fuses fadd/fmul disregarding contract flag. In amdgcn
+// they are not fused as amdgcn always use standard fp fusion
+// option which respects contract flag.
+  __host__ __device__ float func3(float a, float b, float c) {
+#pragma clang fp contract(on)
+  float t = b * c;
+  return t + a;
+}
+// COMMON-LABEL: _Z5func3fff
+// NV-OPT-FAST: fma.rn.f32
+// NV-OPT-FAST-NEXT: st.param.f32
+// NV-OPT-FASTSTD: mul.rn.f32
+// NV-OPT-FASTSTD: add.rn.f32
+// NV-OPT-FASTSTD-NEXT: st.param.f32
+// NV-OPT-ON: mul.rn.f32
+// NV-OPT-ON: add.rn.f32
+// NV-OPT-ON-NEXT: st.param.f32
+// NV-OPT-OFF: mul.rn.f32
+// NV-OPT-OFF: add.rn.f32
+// NV-OPT-OFF-NEXT: st.param.f32
+
+// AMD-OPT-FAST-IR: fmul float
+// AMD-OPT-FAST-IR: fadd float
+// AMD-OPT-ON-IR: fmul float
+// AMD-OPT-ON-IR: fadd float
+// AMD-OPT-OFF-IR: fmul float
+// AMD-OPT-OFF-IR: fadd float
 
-// DISABLED:      mul.rn.f32
-// DISABLED-NEXT: add.rn.f32
-// DISABLED-NEXT: st.param.f32
+// AMD-OPT-FAST: v_fmac_f32_e32
+// AMD-OPT-FAST-NEXT: s_setpc_b64
+// AMD-OPT-FASTSTD: v_mul_f32_e32
+// AMD-OPT-FASTSTD-NEXT: v_add_f32_e32
+// AMD-OPT-FASTSTD-NEXT: s_setpc_b64
+// AMD-OPT-ON: v_mul_f32_e32
+// AMD-OPT-ON-NEXT: v_add_f32_e32
+// AMD-OPT-ON-NEXT: s_setpc_b64
+// AMD-OPT-OFF: v_mul_f32_e32
+// AMD-OPT-OFF-NEXT: v_add_f32_e32
+// AMD-OPT-OFF-NEXT: s_setpc_b64
Index: clang/test/Driver/autocomplete.c
===================================================================
--- clang/test/Driver/autocomplete.c
+++ clang/test/Driver/autocomplete.c
@@ -66,6 +66,7 @@
 // FNOSANICOVERALL-NEXT: trace-pc-guard
 // RUN: %clang --autocomplete=-ffp-contract= | FileCheck %s -check-prefix=FFPALL
 // FFPALL: fast
+// FFPALL-NEXT: faststd 
 // FFPALL-NEXT: off
 // FFPALL-NEXT: on
 // RUN: %clang --autocomplete=-flto= | FileCheck %s -check-prefix=FLTOALL