Index: clang/lib/CodeGen/BackendUtil.cpp
===================================================================
--- clang/lib/CodeGen/BackendUtil.cpp
+++ clang/lib/CodeGen/BackendUtil.cpp
@@ -473,6 +473,18 @@
     break;
   }
 
+  // HIP toolchain does not support 'Fast' FPOpFusion in backends since it fuses
+  // multiplication/addition instructions without contract flag from device
+  // library functions in LLVM bitcode, which causes accuracy loss in certain
+  // math functions, e.g. tan(-1e20) becomes -0.933 instead of 0.8446. For
+  // device library functions in bitcode to work, 'Strict' or 'Standard'
+  // FPOpFusion options in backends is needed.
+  //
+  // Clang may need a separate option to control FPOpFusion option in backend
+  // instead of using -ffp-contract to control both frontend and backend.
+  if (LangOpts.HIP)
+    Options.AllowFPOpFusion = llvm::FPOpFusion::Standard;
+
   Options.UseInitArray = CodeGenOpts.UseInitArray;
   Options.DisableIntegratedAS = CodeGenOpts.DisableIntegratedAS;
   Options.CompressDebugSections = CodeGenOpts.getCompressDebugSections();
Index: clang/test/CodeGenCUDA/fp-contract.cu
===================================================================
--- clang/test/CodeGenCUDA/fp-contract.cu
+++ clang/test/CodeGenCUDA/fp-contract.cu
@@ -1,32 +1,241 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
+// REQUIRES: amdgpu-registered-target
 
-// By default we should fuse multiply/add into fma instruction.
+// By default CUDA/HIP uses -ffp-contract=fast.
+// we should fuse multiply/add into fma instruction.
+// In IR, fmul/fadd instructions with contract flag are emitted.
+// In backend
+//    nvptx -  assumes fast fp fuse option, which fuses
+//             mult/add insts disregarding contract flag and
+//             llvm.fmuladd intrinsics.
+//    amdgcn - assumes standard fp fuse option, which only
+//             fuses mult/add insts with contract flag and
+//             llvm.fmuladd intrinsics.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
-// RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s
+// RUN:   -O3 -o - %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s
 
 // Explicit -ffp-contract=fast
+// In IR, fmul/fadd instructions with contract flag are emitted.
+// In backend
+//    nvptx -  assumes fast fp fuse option, which fuses
+//             mult/add insts disregarding contract flag and
+//             llvm.fmuladd intrinsics.
+//    amdgcn - assumes standard fp fuse option, which only
+//             fuses mult/add insts with contract flag and
+//             llvm.fmuladd intrinsics.
+
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=fast -disable-llvm-passes -o - %s \
-// RUN:   | FileCheck -check-prefix ENABLED %s
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=fast \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=fast \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=fast \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=fast \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -ffp-contract=fast \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s
 
 // Explicit -ffp-contract=on -- fusing by front-end.
+// In IR,
+//    mult/add in the same statement - llvm.fmuladd instrinsic emitted
+//    mult/add in different statement -  fmul/fadd instructions without
+//                                       contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes standard fp fuse option, which only
+//                   fuses mult/add insts with contract flag or
+//                   llvm.fmuladd intrinsics.
+
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=on -disable-llvm-passes -o - %s \
-// RUN:   | FileCheck -check-prefix ENABLED %s
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=on \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=on \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=on \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
+
+// Check separate compile/backend steps corresponding to -save-temps.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=on \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -ffp-contract=on \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
 
 // Explicit -ffp-contract=off should disable instruction fusing.
+// In IR, fmul/fadd instructions without contract flag are emitted.
+// In backend
+//    nvptx/amdgcn - assumes standard fp fuse option, which only
+//                   fuses mult/add insts with contract flag or
+//                   llvm.fmuladd intrinsics.
+
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 // RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
-// RUN:   | FileCheck -check-prefix DISABLED %s
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OFF %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
+// RUN:   -ffp-contract=off \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OFF %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -O3 -o - %s \
+// RUN:   -ffp-contract=off \
+// RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
+// RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
+// RUN:   -ffp-contract=off \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
 
+// Check separate compile/backend steps corresponding to -save-temps.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -ffp-contract=off \
+// RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
+// RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
+// RUN:   -ffp-contract=off \
+// RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
+// RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
 
 #include "Inputs/cuda.h"
 
+// Test multiply/add in the same statement, which can be emitted as FMA when
+// fp-contract is on or fast.
 __host__ __device__ float func(float a, float b, float c) { return a + b * c; }
-// ENABLED:       fma.rn.f32
-// ENABLED-NEXT:  st.param.f32
+// COMMON-LABEL: _Z4funcfff
+// NV-ON:       fma.rn.f32
+// NV-ON-NEXT:  st.param.f32
+// AMD-ON:       v_fmac_f32_e64
+// AMD-ON-NEXT:  s_setpc_b64
+
+// NV-OFF:      mul.rn.f32
+// NV-OFF-NEXT: add.rn.f32
+// NV-OFF-NEXT: st.param.f32
+// AMD-OFF:      v_mul_f32_e64
+// AMD-OFF-NEXT: v_add_f32_e64
+// AMD-OFF-NEXT: s_setpc_b64
+
+// AMD-OPT-FAST-IR: fmul contract float
+// AMD-OPT-FAST-IR: fadd contract float
+// AMD-OPT-FAST: v_fmac_f32_e32
+// AMD-OPT-FAST-NEXT: s_setpc_b64
+// AMD-OPT-ON-IR: @llvm.fmuladd.f32
+// AMD-OPT-ON: v_fmac_f32_e32
+// AMD-OPT-ON-NEXT: s_setpc_b64
+// AMD-OPT-OFF-IR: fmul float
+// AMD-OPT-OFF-IR: fadd float
+// AMD-OPT-OFF: v_mul_f32_e32
+// AMD-OPT-OFF-NEXT: v_add_f32_e32
+// AMD-OPT-OFF-NEXT: s_setpc_b64
+
+// Test multiply/add in the different statements, which can be emitted as
+// FMA when fp-contract is fast but not on.
+__host__ __device__ float func2(float a, float b, float c) {
+  float t = b * c;
+  return t + a;
+}
+// COMMON-LABEL: _Z5func2fff
+// NV-OPT-FAST:       fma.rn.f32
+// NV-OPT-FAST-NEXT:  st.param.f32
+
+// NV-OPT-ON:      mul.rn.f32
+// NV-OPT-ON:      add.rn.f32
+// NV-OPT-ON-NEXT: st.param.f32
+
+// AMD-OPT-FAST-IR: fmul contract float
+// AMD-OPT-FAST-IR: fadd contract float
+// AMD-OPT-FAST: v_fmac_f32_e32
+// AMD-OPT-FAST-NEXT: s_setpc_b64
+// AMD-OPT-ON-IR: fmul float
+// AMD-OPT-ON-IR: fadd float
+// AMD-OPT-ON: v_mul_f32_e32
+// AMD-OPT-ON-NEXT: v_add_f32_e32
+// AMD-OPT-ON-NEXT: s_setpc_b64
+// AMD-OPT-OFF-IR: fmul float
+// AMD-OPT-OFF-IR: fadd float
+// AMD-OPT-OFF: v_mul_f32_e32
+// AMD-OPT-OFF-NEXT: v_add_f32_e32
+// AMD-OPT-OFF-NEXT: s_setpc_b64
+
+// Test multiply/add in the different statements, which is forced
+// to be compiled with fp contract on. fmul/fadd without contract
+// flags are emitted in IR. In nvptx, they are emitted as FMA in
+// fp-contract is fast but not on, as nvptx backend uses the same
+// fp fuse option as front end, whereas fast fp fuse option in
+// backend fuses fadd/fmul disregarding contract flag. In amdgcn
+// they are not fused as amdgcn always use standard fp fusion
+// option which respects contract flag.
+  __host__ __device__ float func3(float a, float b, float c) {
+#pragma clang fp contract(on)
+  float t = b * c;
+  return t + a;
+}
+// COMMON-LABEL: _Z5func3fff
+// NV-OPT-FAST:       fma.rn.f32
+// NV-OPT-FAST-NEXT:  st.param.f32
+
+// NV-OPT-ON:      mul.rn.f32
+// NV-OPT-ON:      add.rn.f32
+// NV-OPT-ON-NEXT: st.param.f32
 
-// DISABLED:      mul.rn.f32
-// DISABLED-NEXT: add.rn.f32
-// DISABLED-NEXT: st.param.f32
+// AMD-OPT-FAST-IR: fmul float
+// AMD-OPT-FAST-IR: fadd float
+// AMD-OPT-FAST: v_mul_f32_e32
+// AMD-OPT-FAST-NEXT: v_add_f32_e32
+// AMD-OPT-FAST-NEXT: s_setpc_b64
+// AMD-OPT-ON-IR: fmul float
+// AMD-OPT-ON-IR: fadd float
+// AMD-OPT-ON: v_mul_f32_e32
+// AMD-OPT-ON-NEXT: v_add_f32_e32
+// AMD-OPT-ON-NEXT: s_setpc_b64
+// AMD-OPT-OFF-IR: fmul float
+// AMD-OPT-OFF-IR: fadd float
+// AMD-OPT-OFF: v_mul_f32_e32
+// AMD-OPT-OFF-NEXT: v_add_f32_e32
+// AMD-OPT-OFF-NEXT: s_setpc_b64