Index: clang/include/clang/Basic/LangOptions.h =================================================================== --- clang/include/clang/Basic/LangOptions.h +++ clang/include/clang/Basic/LangOptions.h @@ -182,8 +182,11 @@ // Enable the floating point pragma FPM_On, - // Aggressively fuse FP ops (E.g. FMA). - FPM_Fast + // Aggressively fuse FP ops (E.g. FMA) disregarding pragmas. + FPM_Fast, + + // Aggressively fuse FP ops and respect pragmas. + FPM_FastStd }; /// Alias for RoundingMode::NearestTiesToEven. @@ -405,7 +408,13 @@ } explicit FPOptions(const LangOptions &LO) { Value = 0; - setFPContractMode(LO.getDefaultFPContractMode()); + // The language fp contract option FPM_FastStd has the same effect as + // FPM_Fast in frontend. For simplicity, use FPM_Fast uniformly in + // frontend. + auto LangOptContractMode = LO.getDefaultFPContractMode(); + if (LangOptContractMode == LangOptions::FPM_FastStd) + LangOptContractMode = LangOptions::FPM_Fast; + setFPContractMode(LangOptContractMode); setRoundingMode(LO.getFPRoundingMode()); setFPExceptionMode(LO.getFPExceptionMode()); setAllowFEnvAccess(LangOptions::FPM_Off); Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -1230,9 +1230,13 @@ def ftrapping_math : Flag<["-"], "ftrapping-math">, Group, Flags<[CC1Option]>; def fno_trapping_math : Flag<["-"], "fno-trapping-math">, Group, Flags<[CC1Option]>; def ffp_contract : Joined<["-"], "ffp-contract=">, Group, - Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)" - " | on (according to FP_CONTRACT pragma) | off (never fuse). Default" - " is 'fast' for CUDA/HIP and 'on' otherwise.">, Values<"fast,on,off">; + Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs):" + " fast (fuses across statements disregarding pragmas)" + " | on (only fuses in the same statement unless dictated by pragmas)" + " | off (never fuses)" + " | faststd (fuses across statements unless diectated by pragmas)." + " Default is 'fast' for CUDA, 'faststd' for HIP, and 'on' otherwise.">, + Values<"fast,on,off,faststd">; defm strict_float_cast_overflow : OptOutFFlag<"strict-float-cast-overflow", "Assume that overflowing float-to-int casts are undefined (default)", Index: clang/lib/CodeGen/BackendUtil.cpp =================================================================== --- clang/lib/CodeGen/BackendUtil.cpp +++ clang/lib/CodeGen/BackendUtil.cpp @@ -466,6 +466,7 @@ Options.AllowFPOpFusion = llvm::FPOpFusion::Standard; break; case LangOptions::FPM_On: + case LangOptions::FPM_FastStd: Options.AllowFPOpFusion = llvm::FPOpFusion::Standard; break; case LangOptions::FPM_Fast: Index: clang/lib/Frontend/CompilerInvocation.cpp =================================================================== --- clang/lib/Frontend/CompilerInvocation.cpp +++ clang/lib/Frontend/CompilerInvocation.cpp @@ -2399,9 +2399,20 @@ Opts.HIP = IK.getLanguage() == Language::HIP; Opts.CUDA = IK.getLanguage() == Language::CUDA || Opts.HIP; - if (Opts.CUDA) - // Set default FP_CONTRACT to FAST. + if (Opts.HIP) { + // HIP toolchain does not support 'Fast' FPOpFusion in backends since it + // fuses multiplication/addition instructions without contract flag from + // device library functions in LLVM bitcode, which causes accuracy loss in + // certain math functions, e.g. tan(-1e20) becomes -0.933 instead of 0.8446. + // For device library functions in bitcode to work, 'Strict' or 'Standard' + // FPOpFusion options in backends is needed. Therefore 'faststd' FP contract + // option is used to allow fuse across statements in frontend whereas + // respecting contract flag in backend. + Opts.setDefaultFPContractMode(LangOptions::FPM_FastStd); + } else if (Opts.CUDA) { + // Allow fuse across statements disregarding pragmas. Opts.setDefaultFPContractMode(LangOptions::FPM_Fast); + } Opts.RenderScript = IK.getLanguage() == Language::RenderScript; if (Opts.RenderScript) { @@ -3330,6 +3341,8 @@ Opts.setDefaultFPContractMode(LangOptions::FPM_On); else if (Val == "off") Opts.setDefaultFPContractMode(LangOptions::FPM_Off); + else if (Val == "faststd") + Opts.setDefaultFPContractMode(LangOptions::FPM_FastStd); else Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val; } Index: clang/test/CodeGenCUDA/fp-contract.cu =================================================================== --- clang/test/CodeGenCUDA/fp-contract.cu +++ clang/test/CodeGenCUDA/fp-contract.cu @@ -1,32 +1,298 @@ -// REQUIRES: x86-registered-target -// REQUIRES: nvptx-registered-target +// REQUIRES: x86-registered-target, nvptx-registered-target, amdgpu-registered-target -// By default we should fuse multiply/add into fma instruction. +// By default CUDA uses -ffp-contract=fast, HIP uses -ffp-contract=faststd. +// we should fuse multiply/add into fma instruction. +// In IR, fmul/fadd instructions with contract flag are emitted. +// In backend +// nvptx - assumes fast fp fuse option, which fuses +// mult/add insts disregarding contract flag and +// llvm.fmuladd intrinsics. +// amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag and +// llvm.fmuladd intrinsics. + +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -disable-llvm-passes -o - %s \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ -// RUN: -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s +// RUN: -O3 -o - %s \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s + +// Check separate compile/backend steps corresponding to -save-temps. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s // Explicit -ffp-contract=fast +// In IR, fmul/fadd instructions with contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes fast fp fuse option, which fuses +// mult/add insts disregarding contract flag and +// llvm.fmuladd intrinsics. + // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -ffp-contract=fast -disable-llvm-passes -o - %s \ -// RUN: | FileCheck -check-prefix ENABLED %s +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=fast \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=fast \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=fast \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s + +// Check separate compile/backend steps corresponding to -save-temps. +// When input is IR, -ffp-contract has no effect. Backend uses default +// default FP fuse option. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=fast \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s + +// Explicit -ffp-contract=faststd +// In IR, fmul/fadd instructions with contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag or +// llvm.fmuladd intrinsics. + +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -ffp-contract=faststd -disable-llvm-passes -o - %s \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=faststd \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=faststd \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-FASTSTD %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=faststd \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s + +// Check separate compile/backend steps corresponding to -save-temps. +// When input is IR, -ffp-contract has no effect. Backend uses default +// default FP fuse option. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=faststd \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s // Explicit -ffp-contract=on -- fusing by front-end. +// In IR, +// mult/add in the same statement - llvm.fmuladd instrinsic emitted +// mult/add in different statement - fmul/fadd instructions without +// contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag or +// llvm.fmuladd intrinsics. + // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -ffp-contract=on -disable-llvm-passes -o - %s \ -// RUN: | FileCheck -check-prefix ENABLED %s +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=on \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=on \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=on \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s + +// Check separate compile/backend steps corresponding to -save-temps. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=on \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s // Explicit -ffp-contract=off should disable instruction fusing. +// In IR, fmul/fadd instructions without contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag or +// llvm.fmuladd intrinsics. + // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -ffp-contract=off -disable-llvm-passes -o - %s \ -// RUN: | FileCheck -check-prefix DISABLED %s +// RUN: | FileCheck -check-prefixes=COMMON,NV-OFF %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=off \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OFF %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=off \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-OFF %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=off \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s + +// Check separate compile/backend steps corresponding to -save-temps. +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=off \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s #include "Inputs/cuda.h" +// Test multiply/add in the same statement, which can be emitted as FMA when +// fp-contract is on or fast. __host__ __device__ float func(float a, float b, float c) { return a + b * c; } -// ENABLED: fma.rn.f32 -// ENABLED-NEXT: st.param.f32 +// COMMON-LABEL: _Z4funcfff +// NV-ON: fma.rn.f32 +// NV-ON-NEXT: st.param.f32 +// AMD-ON: v_fmac_f32_e64 +// AMD-ON-NEXT: s_setpc_b64 + +// NV-OFF: mul.rn.f32 +// NV-OFF-NEXT: add.rn.f32 +// NV-OFF-NEXT: st.param.f32 +// AMD-OFF: v_mul_f32_e64 +// AMD-OFF-NEXT: v_add_f32_e64 +// AMD-OFF-NEXT: s_setpc_b64 + +// NV-OPT-FAST: fma.rn.f32 +// NV-OPT-FAST-NEXT: st.param.f32 +// NV-OPT-FASTSTD: fma.rn.f32 +// NV-OPT-FASTSTD-NEXT: st.param.f32 +// NV-OPT-ON: fma.rn.f32 +// NV-OPT-ON-NEXT: st.param.f32 +// NV-OPT-OFF: mul.rn.f32 +// NV-OPT-OFF-NEXT: add.rn.f32 +// NV-OPT-OFF-NEXT: st.param.f32 + +// AMD-OPT-FAST-IR: fmul contract float +// AMD-OPT-FAST-IR: fadd contract float +// AMD-OPT-ON-IR: @llvm.fmuladd.f32 +// AMD-OPT-OFF-IR: fmul float +// AMD-OPT-OFF-IR: fadd float + +// AMD-OPT-FAST: v_fmac_f32_e32 +// AMD-OPT-FAST-NEXT: s_setpc_b64 +// AMD-OPT-FASTSTD: v_fmac_f32_e32 +// AMD-OPT-FASTSTD-NEXT: s_setpc_b64 +// AMD-OPT-ON: v_fmac_f32_e32 +// AMD-OPT-ON-NEXT: s_setpc_b64 +// AMD-OPT-OFF: v_mul_f32_e32 +// AMD-OPT-OFF-NEXT: v_add_f32_e32 +// AMD-OPT-OFF-NEXT: s_setpc_b64 + +// Test multiply/add in the different statements, which can be emitted as +// FMA when fp-contract is fast but not on. +__host__ __device__ float func2(float a, float b, float c) { + float t = b * c; + return t + a; +} +// COMMON-LABEL: _Z5func2fff +// NV-OPT-FAST: fma.rn.f32 +// NV-OPT-FAST-NEXT: st.param.f32 +// NV-OPT-FASTSTD: fma.rn.f32 +// NV-OPT-FASTSTD-NEXT: st.param.f32 +// NV-OPT-ON: mul.rn.f32 +// NV-OPT-ON: add.rn.f32 +// NV-OPT-ON-NEXT: st.param.f32 +// NV-OPT-OFF: mul.rn.f32 +// NV-OPT-OFF: add.rn.f32 +// NV-OPT-OFF-NEXT: st.param.f32 + +// AMD-OPT-FAST-IR: fmul contract float +// AMD-OPT-FAST-IR: fadd contract float +// AMD-OPT-ON-IR: fmul float +// AMD-OPT-ON-IR: fadd float +// AMD-OPT-OFF-IR: fmul float +// AMD-OPT-OFF-IR: fadd float + +// AMD-OPT-FAST: v_fmac_f32_e32 +// AMD-OPT-FAST-NEXT: s_setpc_b64 +// AMD-OPT-FASTSTD: v_fmac_f32_e32 +// AMD-OPT-FASTSTD-NEXT: s_setpc_b64 +// AMD-OPT-ON: v_mul_f32_e32 +// AMD-OPT-ON-NEXT: v_add_f32_e32 +// AMD-OPT-ON-NEXT: s_setpc_b64 +// AMD-OPT-OFF: v_mul_f32_e32 +// AMD-OPT-OFF-NEXT: v_add_f32_e32 +// AMD-OPT-OFF-NEXT: s_setpc_b64 + +// Test multiply/add in the different statements, which is forced +// to be compiled with fp contract on. fmul/fadd without contract +// flags are emitted in IR. In nvptx, they are emitted as FMA in +// fp-contract is fast but not on, as nvptx backend uses the same +// fp fuse option as front end, whereas fast fp fuse option in +// backend fuses fadd/fmul disregarding contract flag. In amdgcn +// they are not fused as amdgcn always use standard fp fusion +// option which respects contract flag. + __host__ __device__ float func3(float a, float b, float c) { +#pragma clang fp contract(on) + float t = b * c; + return t + a; +} +// COMMON-LABEL: _Z5func3fff +// NV-OPT-FAST: fma.rn.f32 +// NV-OPT-FAST-NEXT: st.param.f32 +// NV-OPT-FASTSTD: mul.rn.f32 +// NV-OPT-FASTSTD: add.rn.f32 +// NV-OPT-FASTSTD-NEXT: st.param.f32 +// NV-OPT-ON: mul.rn.f32 +// NV-OPT-ON: add.rn.f32 +// NV-OPT-ON-NEXT: st.param.f32 +// NV-OPT-OFF: mul.rn.f32 +// NV-OPT-OFF: add.rn.f32 +// NV-OPT-OFF-NEXT: st.param.f32 + +// AMD-OPT-FAST-IR: fmul float +// AMD-OPT-FAST-IR: fadd float +// AMD-OPT-ON-IR: fmul float +// AMD-OPT-ON-IR: fadd float +// AMD-OPT-OFF-IR: fmul float +// AMD-OPT-OFF-IR: fadd float -// DISABLED: mul.rn.f32 -// DISABLED-NEXT: add.rn.f32 -// DISABLED-NEXT: st.param.f32 +// AMD-OPT-FAST: v_fmac_f32_e32 +// AMD-OPT-FAST-NEXT: s_setpc_b64 +// AMD-OPT-FASTSTD: v_mul_f32_e32 +// AMD-OPT-FASTSTD-NEXT: v_add_f32_e32 +// AMD-OPT-FASTSTD-NEXT: s_setpc_b64 +// AMD-OPT-ON: v_mul_f32_e32 +// AMD-OPT-ON-NEXT: v_add_f32_e32 +// AMD-OPT-ON-NEXT: s_setpc_b64 +// AMD-OPT-OFF: v_mul_f32_e32 +// AMD-OPT-OFF-NEXT: v_add_f32_e32 +// AMD-OPT-OFF-NEXT: s_setpc_b64 Index: clang/test/Driver/autocomplete.c =================================================================== --- clang/test/Driver/autocomplete.c +++ clang/test/Driver/autocomplete.c @@ -66,6 +66,7 @@ // FNOSANICOVERALL-NEXT: trace-pc-guard // RUN: %clang --autocomplete=-ffp-contract= | FileCheck %s -check-prefix=FFPALL // FFPALL: fast +// FFPALL-NEXT: faststd // FFPALL-NEXT: off // FFPALL-NEXT: on // RUN: %clang --autocomplete=-flto= | FileCheck %s -check-prefix=FLTOALL