diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -3209,8 +3209,9 @@ The pragma can also be used with ``off`` which turns FP contraction off for a section of the code. This can be useful when fast contraction is otherwise -enabled for the translation unit with the ``-ffp-contract=fast`` flag. - +enabled for the translation unit with the ``-ffp-contract=fast-honor-pragmas`` flag. +Note that ``-ffp-contract=fast`` will override pragmas to fuse multiply and +addition across statements regardless of any controlling pragmas. ``#pragma clang fp exceptions`` specifies floating point exception behavior. It may take one the the values: ``ignore``, ``maytrap`` or ``strict``. Meaning of diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -1336,15 +1336,16 @@ The C standard permits intermediate floating-point results within an expression to be computed with more precision than their type would normally allow. This permits operation fusing, and Clang takes advantage - of this by default. This behavior can be controlled with the - ``FP_CONTRACT`` pragma. Please refer to the pragma documentation for a - description of how the pragma interacts with this option. + of this by default. This behavior can be controlled with the ``FP_CONTRACT`` + and ``clang fp contract`` pragmas. Please refer to the pragma documentation + for a description of how the pragmas interact with this option. Valid values are: - * ``fast`` (everywhere) - * ``on`` (according to FP_CONTRACT pragma, default) + * ``fast`` (fuse across statements disregarding pragmas, default for CUDA) + * ``on`` (fuse in the same statement unless dictated by pragmas, default for languages other than CUDA/HIP) * ``off`` (never fuse) + * ``fast-honor-pragmas`` (fuse across statements unless dictated by pragmas, default for HIP) .. _opt_fhonor-infinities: diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -187,8 +187,11 @@ // Enable the floating point pragma FPM_On, - // Aggressively fuse FP ops (E.g. FMA). - FPM_Fast + // Aggressively fuse FP ops (E.g. FMA) disregarding pragmas. + FPM_Fast, + + // Aggressively fuse FP ops and honor pragmas. + FPM_FastHonorPragmas }; /// Alias for RoundingMode::NearestTiesToEven. @@ -417,7 +420,13 @@ } explicit FPOptions(const LangOptions &LO) { Value = 0; - setFPContractMode(LO.getDefaultFPContractMode()); + // The language fp contract option FPM_FastHonorPragmas has the same effect + // as FPM_Fast in frontend. For simplicity, use FPM_Fast uniformly in + // frontend. + auto LangOptContractMode = LO.getDefaultFPContractMode(); + if (LangOptContractMode == LangOptions::FPM_FastHonorPragmas) + LangOptContractMode = LangOptions::FPM_Fast; + setFPContractMode(LangOptContractMode); setRoundingMode(LO.getFPRoundingMode()); setFPExceptionMode(LO.getFPExceptionMode()); setAllowFPReassociate(LO.AllowFPReassoc); diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1276,9 +1276,13 @@ def ftrapping_math : Flag<["-"], "ftrapping-math">, Group, Flags<[CC1Option]>; def fno_trapping_math : Flag<["-"], "fno-trapping-math">, Group, Flags<[CC1Option]>; def ffp_contract : Joined<["-"], "ffp-contract=">, Group, - Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)" - " | on (according to FP_CONTRACT pragma) | off (never fuse). Default" - " is 'fast' for CUDA/HIP and 'on' otherwise.">, Values<"fast,on,off">; + Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs):" + " fast (fuses across statements disregarding pragmas)" + " | on (only fuses in the same statement unless dictated by pragmas)" + " | off (never fuses)" + " | fast-honor-pragmas (fuses across statements unless diectated by pragmas)." + " Default is 'fast' for CUDA, 'fast-honor-pragmas' for HIP, and 'on' otherwise.">, + Values<"fast,on,off,fast-honor-pragmas">; defm strict_float_cast_overflow : OptOutFFlag<"strict-float-cast-overflow", "Assume that overflowing float-to-int casts are undefined (default)", diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -481,6 +481,7 @@ Options.AllowFPOpFusion = llvm::FPOpFusion::Standard; break; case LangOptions::FPM_On: + case LangOptions::FPM_FastHonorPragmas: Options.AllowFPOpFusion = llvm::FPOpFusion::Standard; break; case LangOptions::FPM_Fast: diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2424,9 +2424,20 @@ Opts.HIP = IK.getLanguage() == Language::HIP; Opts.CUDA = IK.getLanguage() == Language::CUDA || Opts.HIP; - if (Opts.CUDA) - // Set default FP_CONTRACT to FAST. + if (Opts.HIP) { + // HIP toolchain does not support 'Fast' FPOpFusion in backends since it + // fuses multiplication/addition instructions without contract flag from + // device library functions in LLVM bitcode, which causes accuracy loss in + // certain math functions, e.g. tan(-1e20) becomes -0.933 instead of 0.8446. + // For device library functions in bitcode to work, 'Strict' or 'Standard' + // FPOpFusion options in backends is needed. Therefore 'fast-honor-pragmas' + // FP contract option is used to allow fuse across statements in frontend + // whereas respecting contract flag in backend. + Opts.setDefaultFPContractMode(LangOptions::FPM_FastHonorPragmas); + } else if (Opts.CUDA) { + // Allow fuse across statements disregarding pragmas. Opts.setDefaultFPContractMode(LangOptions::FPM_Fast); + } Opts.RenderScript = IK.getLanguage() == Language::RenderScript; if (Opts.RenderScript) { @@ -3343,6 +3354,8 @@ Opts.setDefaultFPContractMode(LangOptions::FPM_On); else if (Val == "off") Opts.setDefaultFPContractMode(LangOptions::FPM_Off); + else if (Val == "fast-honor-pragmas") + Opts.setDefaultFPContractMode(LangOptions::FPM_FastHonorPragmas); else Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val; } diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp --- a/clang/lib/Sema/SemaAttr.cpp +++ b/clang/lib/Sema/SemaAttr.cpp @@ -966,6 +966,8 @@ case LangOptions::FPM_Off: NewFPFeatures.setDisallowFPContract(); break; + case LangOptions::FPM_FastHonorPragmas: + llvm_unreachable("Should not happen"); } FpPragmaStack.Act(Loc, Sema::PSK_Set, StringRef(), NewFPFeatures); CurFPFeatures = NewFPFeatures.applyOverrides(getLangOpts()); diff --git a/clang/test/CodeGenCUDA/fp-contract.cu b/clang/test/CodeGenCUDA/fp-contract.cu --- a/clang/test/CodeGenCUDA/fp-contract.cu +++ b/clang/test/CodeGenCUDA/fp-contract.cu @@ -1,32 +1,298 @@ -// REQUIRES: x86-registered-target -// REQUIRES: nvptx-registered-target +// REQUIRES: x86-registered-target, nvptx-registered-target, amdgpu-registered-target -// By default we should fuse multiply/add into fma instruction. +// By default CUDA uses -ffp-contract=fast, HIP uses -ffp-contract=fast-honor-pragmas. +// we should fuse multiply/add into fma instruction. +// In IR, fmul/fadd instructions with contract flag are emitted. +// In backend +// nvptx - assumes fast fp fuse option, which fuses +// mult/add insts disregarding contract flag and +// llvm.fmuladd intrinsics. +// amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag and +// llvm.fmuladd intrinsics. + +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -disable-llvm-passes -o - %s \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ -// RUN: -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s +// RUN: -O3 -o - %s \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s + +// Check separate compile/backend steps corresponding to -save-temps. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s // Explicit -ffp-contract=fast +// In IR, fmul/fadd instructions with contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes fast fp fuse option, which fuses +// mult/add insts disregarding contract flag and +// llvm.fmuladd intrinsics. + // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -ffp-contract=fast -disable-llvm-passes -o - %s \ -// RUN: | FileCheck -check-prefix ENABLED %s +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=fast \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=fast \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=fast \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s + +// Check separate compile/backend steps corresponding to -save-temps. +// When input is IR, -ffp-contract has no effect. Backend uses default +// default FP fuse option. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=fast \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s + +// Explicit -ffp-contract=fast-honor-pragmas +// In IR, fmul/fadd instructions with contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag or +// llvm.fmuladd intrinsics. + +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -ffp-contract=fast-honor-pragmas -disable-llvm-passes -o - %s \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=fast-honor-pragmas \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=fast-honor-pragmas \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-FASTSTD %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=fast-honor-pragmas \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s + +// Check separate compile/backend steps corresponding to -save-temps. +// When input is IR, -ffp-contract has no effect. Backend uses default +// default FP fuse option. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=fast-honor-pragmas \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s // Explicit -ffp-contract=on -- fusing by front-end. +// In IR, +// mult/add in the same statement - llvm.fmuladd instrinsic emitted +// mult/add in different statement - fmul/fadd instructions without +// contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag or +// llvm.fmuladd intrinsics. + // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -ffp-contract=on -disable-llvm-passes -o - %s \ -// RUN: | FileCheck -check-prefix ENABLED %s +// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=on \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=on \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=on \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s + +// Check separate compile/backend steps corresponding to -save-temps. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=on \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s // Explicit -ffp-contract=off should disable instruction fusing. +// In IR, fmul/fadd instructions without contract flag are emitted. +// In backend +// nvptx/amdgcn - assumes standard fp fuse option, which only +// fuses mult/add insts with contract flag or +// llvm.fmuladd intrinsics. + // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -ffp-contract=off -disable-llvm-passes -o - %s \ -// RUN: | FileCheck -check-prefix DISABLED %s +// RUN: | FileCheck -check-prefixes=COMMON,NV-OFF %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -ffp-contract=off \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OFF %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -O3 -o - %s \ +// RUN: -ffp-contract=off \ +// RUN: | FileCheck -check-prefixes=COMMON,NV-OPT-OFF %s +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x hip %s \ +// RUN: -ffp-contract=off \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s + +// Check separate compile/backend steps corresponding to -save-temps. +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -ffp-contract=off \ +// RUN: -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s +// RUN: cat %t.ll | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \ +// RUN: -O3 -target-cpu gfx906 -o - -x ir %t.ll \ +// RUN: | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s #include "Inputs/cuda.h" +// Test multiply/add in the same statement, which can be emitted as FMA when +// fp-contract is on or fast. __host__ __device__ float func(float a, float b, float c) { return a + b * c; } -// ENABLED: fma.rn.f32 -// ENABLED-NEXT: st.param.f32 +// COMMON-LABEL: _Z4funcfff +// NV-ON: fma.rn.f32 +// NV-ON-NEXT: st.param.f32 +// AMD-ON: v_fmac_f32_e64 +// AMD-ON-NEXT: s_setpc_b64 + +// NV-OFF: mul.rn.f32 +// NV-OFF-NEXT: add.rn.f32 +// NV-OFF-NEXT: st.param.f32 +// AMD-OFF: v_mul_f32_e64 +// AMD-OFF-NEXT: v_add_f32_e64 +// AMD-OFF-NEXT: s_setpc_b64 + +// NV-OPT-FAST: fma.rn.f32 +// NV-OPT-FAST-NEXT: st.param.f32 +// NV-OPT-FASTSTD: fma.rn.f32 +// NV-OPT-FASTSTD-NEXT: st.param.f32 +// NV-OPT-ON: fma.rn.f32 +// NV-OPT-ON-NEXT: st.param.f32 +// NV-OPT-OFF: mul.rn.f32 +// NV-OPT-OFF-NEXT: add.rn.f32 +// NV-OPT-OFF-NEXT: st.param.f32 + +// AMD-OPT-FAST-IR: fmul contract float +// AMD-OPT-FAST-IR: fadd contract float +// AMD-OPT-ON-IR: @llvm.fmuladd.f32 +// AMD-OPT-OFF-IR: fmul float +// AMD-OPT-OFF-IR: fadd float + +// AMD-OPT-FAST: v_fmac_f32_e32 +// AMD-OPT-FAST-NEXT: s_setpc_b64 +// AMD-OPT-FASTSTD: v_fmac_f32_e32 +// AMD-OPT-FASTSTD-NEXT: s_setpc_b64 +// AMD-OPT-ON: v_fmac_f32_e32 +// AMD-OPT-ON-NEXT: s_setpc_b64 +// AMD-OPT-OFF: v_mul_f32_e32 +// AMD-OPT-OFF-NEXT: v_add_f32_e32 +// AMD-OPT-OFF-NEXT: s_setpc_b64 + +// Test multiply/add in the different statements, which can be emitted as +// FMA when fp-contract is fast but not on. +__host__ __device__ float func2(float a, float b, float c) { + float t = b * c; + return t + a; +} +// COMMON-LABEL: _Z5func2fff +// NV-OPT-FAST: fma.rn.f32 +// NV-OPT-FAST-NEXT: st.param.f32 +// NV-OPT-FASTSTD: fma.rn.f32 +// NV-OPT-FASTSTD-NEXT: st.param.f32 +// NV-OPT-ON: mul.rn.f32 +// NV-OPT-ON: add.rn.f32 +// NV-OPT-ON-NEXT: st.param.f32 +// NV-OPT-OFF: mul.rn.f32 +// NV-OPT-OFF: add.rn.f32 +// NV-OPT-OFF-NEXT: st.param.f32 + +// AMD-OPT-FAST-IR: fmul contract float +// AMD-OPT-FAST-IR: fadd contract float +// AMD-OPT-ON-IR: fmul float +// AMD-OPT-ON-IR: fadd float +// AMD-OPT-OFF-IR: fmul float +// AMD-OPT-OFF-IR: fadd float + +// AMD-OPT-FAST: v_fmac_f32_e32 +// AMD-OPT-FAST-NEXT: s_setpc_b64 +// AMD-OPT-FASTSTD: v_fmac_f32_e32 +// AMD-OPT-FASTSTD-NEXT: s_setpc_b64 +// AMD-OPT-ON: v_mul_f32_e32 +// AMD-OPT-ON-NEXT: v_add_f32_e32 +// AMD-OPT-ON-NEXT: s_setpc_b64 +// AMD-OPT-OFF: v_mul_f32_e32 +// AMD-OPT-OFF-NEXT: v_add_f32_e32 +// AMD-OPT-OFF-NEXT: s_setpc_b64 + +// Test multiply/add in the different statements, which is forced +// to be compiled with fp contract on. fmul/fadd without contract +// flags are emitted in IR. In nvptx, they are emitted as FMA in +// fp-contract is fast but not on, as nvptx backend uses the same +// fp fuse option as front end, whereas fast fp fuse option in +// backend fuses fadd/fmul disregarding contract flag. In amdgcn +// they are not fused as amdgcn always use standard fp fusion +// option which respects contract flag. + __host__ __device__ float func3(float a, float b, float c) { +#pragma clang fp contract(on) + float t = b * c; + return t + a; +} +// COMMON-LABEL: _Z5func3fff +// NV-OPT-FAST: fma.rn.f32 +// NV-OPT-FAST-NEXT: st.param.f32 +// NV-OPT-FASTSTD: mul.rn.f32 +// NV-OPT-FASTSTD: add.rn.f32 +// NV-OPT-FASTSTD-NEXT: st.param.f32 +// NV-OPT-ON: mul.rn.f32 +// NV-OPT-ON: add.rn.f32 +// NV-OPT-ON-NEXT: st.param.f32 +// NV-OPT-OFF: mul.rn.f32 +// NV-OPT-OFF: add.rn.f32 +// NV-OPT-OFF-NEXT: st.param.f32 + +// AMD-OPT-FAST-IR: fmul float +// AMD-OPT-FAST-IR: fadd float +// AMD-OPT-ON-IR: fmul float +// AMD-OPT-ON-IR: fadd float +// AMD-OPT-OFF-IR: fmul float +// AMD-OPT-OFF-IR: fadd float -// DISABLED: mul.rn.f32 -// DISABLED-NEXT: add.rn.f32 -// DISABLED-NEXT: st.param.f32 +// AMD-OPT-FAST: v_fmac_f32_e32 +// AMD-OPT-FAST-NEXT: s_setpc_b64 +// AMD-OPT-FASTSTD: v_mul_f32_e32 +// AMD-OPT-FASTSTD-NEXT: v_add_f32_e32 +// AMD-OPT-FASTSTD-NEXT: s_setpc_b64 +// AMD-OPT-ON: v_mul_f32_e32 +// AMD-OPT-ON-NEXT: v_add_f32_e32 +// AMD-OPT-ON-NEXT: s_setpc_b64 +// AMD-OPT-OFF: v_mul_f32_e32 +// AMD-OPT-OFF-NEXT: v_add_f32_e32 +// AMD-OPT-OFF-NEXT: s_setpc_b64 diff --git a/clang/test/Driver/autocomplete.c b/clang/test/Driver/autocomplete.c --- a/clang/test/Driver/autocomplete.c +++ b/clang/test/Driver/autocomplete.c @@ -68,6 +68,7 @@ // FNOSANICOVERALL-NEXT: trace-pc-guard // RUN: %clang --autocomplete=-ffp-contract= | FileCheck %s -check-prefix=FFPALL // FFPALL: fast +// FFPALL-NEXT: fast-honor-pragmas // FFPALL-NEXT: off // FFPALL-NEXT: on // RUN: %clang --autocomplete=-flto= | FileCheck %s -check-prefix=FLTOALL