Index: clang/lib/CodeGen/BackendUtil.cpp =================================================================== --- clang/lib/CodeGen/BackendUtil.cpp +++ clang/lib/CodeGen/BackendUtil.cpp @@ -473,6 +473,18 @@ break; } + // HIP toolchain does not support 'Fast' FPOpFusion in backends since it fuses + // multiplication/addition instructions without contract flag from device + // library functions in LLVM bitcode, which causes accuracy loss in certain + // math functions, e.g. tan(-1e20) becomes -0.933 instead of 0.8446. For + // device library functions in bitcode to work, 'Strict' or 'Standard' + // FPOpFusion options in backends is needed. + // + // Clang may need a separate option to control FPOpFusion option in backend + // instead of using -ffp-contract to control both frontend and backend. + if (LangOpts.HIP) + Options.AllowFPOpFusion = llvm::FPOpFusion::Standard; + Options.UseInitArray = CodeGenOpts.UseInitArray; Options.DisableIntegratedAS = CodeGenOpts.DisableIntegratedAS; Options.CompressDebugSections = CodeGenOpts.getCompressDebugSections(); Index: clang/test/CodeGenCUDA/fp-contract.cu =================================================================== --- clang/test/CodeGenCUDA/fp-contract.cu +++ clang/test/CodeGenCUDA/fp-contract.cu @@ -1,32 +1,241 @@ // REQUIRES: x86-registered-target // REQUIRES: nvptx-registered-target +// REQUIRES: amdgpu-registered-target -// By default we should fuse multiply/add into fma instruction. +// By default CUDA/HIP uses -ffp-contract=fast. +// we should fuse multiply/add into fma instruction. +// In IR, fmul/fadd instructions with contract flag are emitted. +// In backend +// nvptx - assumes fast fp fuse option, which fuses +// mult/add insts disregarding contract flag and +// llvm.fmuladd intrinsics. +// amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag and +// llvm.fmuladd intrinsics. + +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -disable-llvm-passes -o - %s \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ -// RUN: -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s +// RUN: -O3 -o - %s \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s + +// Check separate compile/backend steps corresponding to -save-temps. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s // Explicit -ffp-contract=fast +// In IR, fmul/fadd instructions with contract flag are emitted. +// In backend +// nvptx - assumes fast fp fuse option, which fuses +// mult/add insts disregarding contract flag and +// llvm.fmuladd intrinsics. +// amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag and +// llvm.fmuladd intrinsics. + // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -ffp-contract=fast -disable-llvm-passes -o - %s \ -// RUN: | FileCheck -check-prefix ENABLED %s +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=fast \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=fast \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=fast \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s + +// Check separate compile/backend steps corresponding to -save-temps. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=fast \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -ffp-contract=fast \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s // Explicit -ffp-contract=on -- fusing by front-end. +// In IR, +// mult/add in the same statement - llvm.fmuladd instrinsic emitted +// mult/add in different statement - fmul/fadd instructions without +// contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag or +// llvm.fmuladd intrinsics. + // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -ffp-contract=on -disable-llvm-passes -o - %s \ -// RUN: | FileCheck -check-prefix ENABLED %s +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=on \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=on \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=on \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s + +// Check separate compile/backend steps corresponding to -save-temps. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=on \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -ffp-contract=on \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s // Explicit -ffp-contract=off should disable instruction fusing. +// In IR, fmul/fadd instructions without contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag or +// llvm.fmuladd intrinsics. + // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -ffp-contract=off -disable-llvm-passes -o - %s \ -// RUN: | FileCheck -check-prefix DISABLED %s +// RUN: | FileCheck -check-prefixes=COMMON,NV-OFF %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=off \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OFF %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=off \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=off \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s +// Check separate compile/backend steps corresponding to -save-temps. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=off \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -ffp-contract=off \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s #include "Inputs/cuda.h" +// Test multiply/add in the same statement, which can be emitted as FMA when +// fp-contract is on or fast. __host__ __device__ float func(float a, float b, float c) { return a + b * c; } -// ENABLED: fma.rn.f32 -// ENABLED-NEXT: st.param.f32 +// COMMON-LABEL: _Z4funcfff +// NV-ON: fma.rn.f32 +// NV-ON-NEXT: st.param.f32 +// AMD-ON: v_fmac_f32_e64 +// AMD-ON-NEXT: s_setpc_b64 + +// NV-OFF: mul.rn.f32 +// NV-OFF-NEXT: add.rn.f32 +// NV-OFF-NEXT: st.param.f32 +// AMD-OFF: v_mul_f32_e64 +// AMD-OFF-NEXT: v_add_f32_e64 +// AMD-OFF-NEXT: s_setpc_b64 + +// AMD-OPT-FAST-IR: fmul contract float +// AMD-OPT-FAST-IR: fadd contract float +// AMD-OPT-FAST: v_fmac_f32_e32 +// AMD-OPT-FAST-NEXT: s_setpc_b64 +// AMD-OPT-ON-IR: @llvm.fmuladd.f32 +// AMD-OPT-ON: v_fmac_f32_e32 +// AMD-OPT-ON-NEXT: s_setpc_b64 +// AMD-OPT-OFF-IR: fmul float +// AMD-OPT-OFF-IR: fadd float +// AMD-OPT-OFF: v_mul_f32_e32 +// AMD-OPT-OFF-NEXT: v_add_f32_e32 +// AMD-OPT-OFF-NEXT: s_setpc_b64 + +// Test multiply/add in the different statements, which can be emitted as +// FMA when fp-contract is fast but not on. +__host__ __device__ float func2(float a, float b, float c) { + float t = b * c; + return t + a; +} +// COMMON-LABEL: _Z5func2fff +// NV-OPT-FAST: fma.rn.f32 +// NV-OPT-FAST-NEXT: st.param.f32 + +// NV-OPT-ON: mul.rn.f32 +// NV-OPT-ON: add.rn.f32 +// NV-OPT-ON-NEXT: st.param.f32 + +// AMD-OPT-FAST-IR: fmul contract float +// AMD-OPT-FAST-IR: fadd contract float +// AMD-OPT-FAST: v_fmac_f32_e32 +// AMD-OPT-FAST-NEXT: s_setpc_b64 +// AMD-OPT-ON-IR: fmul float +// AMD-OPT-ON-IR: fadd float +// AMD-OPT-ON: v_mul_f32_e32 +// AMD-OPT-ON-NEXT: v_add_f32_e32 +// AMD-OPT-ON-NEXT: s_setpc_b64 +// AMD-OPT-OFF-IR: fmul float +// AMD-OPT-OFF-IR: fadd float +// AMD-OPT-OFF: v_mul_f32_e32 +// AMD-OPT-OFF-NEXT: v_add_f32_e32 +// AMD-OPT-OFF-NEXT: s_setpc_b64 + +// Test multiply/add in the different statements, which is forced +// to be compiled with fp contract on. fmul/fadd without contract +// flags are emitted in IR. In nvptx, they are emitted as FMA in +// fp-contract is fast but not on, as nvptx backend uses the same +// fp fuse option as front end, whereas fast fp fuse option in +// backend fuses fadd/fmul disregarding contract flag. In amdgcn +// they are not fused as amdgcn always use standard fp fusion +// option which respects contract flag. + __host__ __device__ float func3(float a, float b, float c) { +#pragma clang fp contract(on) + float t = b * c; + return t + a; +} +// COMMON-LABEL: _Z5func3fff +// NV-OPT-FAST: fma.rn.f32 +// NV-OPT-FAST-NEXT: st.param.f32 + +// NV-OPT-ON: mul.rn.f32 +// NV-OPT-ON: add.rn.f32 +// NV-OPT-ON-NEXT: st.param.f32 -// DISABLED: mul.rn.f32 -// DISABLED-NEXT: add.rn.f32 -// DISABLED-NEXT: st.param.f32 +// AMD-OPT-FAST-IR: fmul float +// AMD-OPT-FAST-IR: fadd float +// AMD-OPT-FAST: v_mul_f32_e32 +// AMD-OPT-FAST-NEXT: v_add_f32_e32 +// AMD-OPT-FAST-NEXT: s_setpc_b64 +// AMD-OPT-ON-IR: fmul float +// AMD-OPT-ON-IR: fadd float +// AMD-OPT-ON: v_mul_f32_e32 +// AMD-OPT-ON-NEXT: v_add_f32_e32 +// AMD-OPT-ON-NEXT: s_setpc_b64 +// AMD-OPT-OFF-IR: fmul float +// AMD-OPT-OFF-IR: fadd float +// AMD-OPT-OFF: v_mul_f32_e32 +// AMD-OPT-OFF-NEXT: v_add_f32_e32 +// AMD-OPT-OFF-NEXT: s_setpc_b64