Index: clang/include/clang/Basic/CodeGenOptions.h =================================================================== --- clang/include/clang/Basic/CodeGenOptions.h +++ clang/include/clang/Basic/CodeGenOptions.h @@ -166,6 +166,9 @@ /// The floating-point denormal mode to use. llvm::DenormalMode FPDenormalMode = llvm::DenormalMode::Invalid; + /// The floating-point subnormal mode to use, for float. + llvm::DenormalMode FP32DenormalMode = llvm::DenormalMode::Invalid; + /// The float precision limit to use, if non-empty. std::string LimitFloatPrecision; Index: clang/include/clang/Basic/CodeGenOptions.def =================================================================== --- clang/include/clang/Basic/CodeGenOptions.def +++ clang/include/clang/Basic/CodeGenOptions.def @@ -153,7 +153,6 @@ CODEGENOPT(ReciprocalMath , 1, 0) ///< Allow FP divisions to be reassociated. CODEGENOPT(NoTrappingMath , 1, 0) ///< Set when -fno-trapping-math is enabled. CODEGENOPT(NoNaNsFPMath , 1, 0) ///< Assume FP arguments, results not NaN. -CODEGENOPT(FlushDenorm , 1, 0) ///< Allow FP denorm numbers to be flushed to zero CODEGENOPT(CorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt /// When false, this attempts to generate code as if the result of an Index: clang/include/clang/Driver/CC1Options.td =================================================================== --- clang/include/clang/Driver/CC1Options.td +++ clang/include/clang/Driver/CC1Options.td @@ -405,6 +405,9 @@ def cfguard : Flag<["-"], "cfguard">, HelpText<"Emit Windows Control Flow Guard tables and checks">; +def fdenormal_fp_math_f32_EQ : Joined<["-"], "fdenormal-fp-math-f32=">, + Group; + //===----------------------------------------------------------------------===// // Dependency Output Options //===----------------------------------------------------------------------===// Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -523,7 +523,7 @@ HelpText<"OpenCL only. Allow use of less precise no signed zeros computations in the generated binary.">; def cl_std_EQ : Joined<["-"], "cl-std=">, Group, Flags<[CC1Option]>, HelpText<"OpenCL language standard to compile for.">, Values<"cl,CL,cl1.1,CL1.1,cl1.2,CL1.2,cl2.0,CL2.0,clc++,CLC++">; -def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group, Flags<[CC1Option]>, +def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group, HelpText<"OpenCL only. Allow denormals to be flushed to zero.">; def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group, Flags<[CC1Option]>, HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">; @@ -581,7 +581,7 @@ def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Group, HelpText<"Path to ptxas (used for compiling CUDA code)">; def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">, - Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">; + HelpText<"Flush denormal floating point values to zero in CUDA device mode.">; def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">; def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">, Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">; Index: clang/include/clang/Driver/ToolChain.h =================================================================== --- clang/include/clang/Driver/ToolChain.h +++ clang/include/clang/Driver/ToolChain.h @@ -16,7 +16,9 @@ #include "clang/Driver/Action.h" #include "clang/Driver/Multilib.h" #include "clang/Driver/Types.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" @@ -606,6 +608,17 @@ /// Returns true when it's possible to split LTO unit to use whole /// program devirtualization and CFI santiizers. virtual bool canSplitThinLTOUnit() const { return true; } + + /// Returns the output denormal handling type in the default floating point + /// environment for the given \p FPType if given. Otherwise, the default + /// assumed mode for any floating point type. + virtual llvm::DenormalMode getDefaultDenormalModeForType( + const llvm::opt::ArgList &DriverArgs, + Action::OffloadKind DeviceOffloadKind, + const llvm::fltSemantics *FPType = nullptr) const { + // FIXME: This should be IEEE when default handling is fixed. + return llvm::DenormalMode::Invalid; + } }; /// Set a ToolChain's effective triple. Reset it when the registration object Index: clang/lib/Basic/Targets/AMDGPU.cpp =================================================================== --- clang/lib/Basic/Targets/AMDGPU.cpp +++ clang/lib/Basic/Targets/AMDGPU.cpp @@ -241,7 +241,8 @@ } if (!hasFP32Denormals) TargetOpts.Features.push_back( - (Twine(hasFastFMAF() && hasFullRateDenormalsF32() && !CGOpts.FlushDenorm + (Twine(hasFastFMAF() && hasFullRateDenormalsF32() && + CGOpts.FP32DenormalMode == llvm::DenormalMode::IEEE ? '+' : '-') + Twine("fp32-denormals")) .str()); // Always do not flush fp64 or fp16 denorms. Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -1747,10 +1747,17 @@ if (CodeGenOpts.NullPointerIsValid) FuncAttrs.addAttribute("null-pointer-is-valid", "true"); + + // TODO: Omit attribute when the default is IEEE. if (CodeGenOpts.FPDenormalMode != llvm::DenormalMode::Invalid) FuncAttrs.addAttribute("denormal-fp-math", llvm::denormalModeName(CodeGenOpts.FPDenormalMode)); + if (CodeGenOpts.FP32DenormalMode != llvm::DenormalMode::Invalid) + FuncAttrs.addAttribute( + "denormal-fp-math-f32", + llvm::denormalModeName(CodeGenOpts.FP32DenormalMode)); + FuncAttrs.addAttribute("no-trapping-math", llvm::toStringRef(CodeGenOpts.NoTrappingMath)); @@ -1777,10 +1784,6 @@ "correctly-rounded-divide-sqrt-fp-math", llvm::toStringRef(CodeGenOpts.CorrectlyRoundedDivSqrt)); - if (getLangOpts().OpenCL) - FuncAttrs.addAttribute("denorms-are-zero", - llvm::toStringRef(CodeGenOpts.FlushDenorm)); - // TODO: Reciprocal estimate codegen options should apply to instructions? const std::vector &Recips = CodeGenOpts.Reciprocals; if (!Recips.empty()) @@ -1813,10 +1816,6 @@ if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) { // Exceptions aren't supported in CUDA device code. FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); - - // Respect -fcuda-flush-denormals-to-zero. - if (CodeGenOpts.FlushDenorm) - FuncAttrs.addAttribute("nvptx-f32ftz", "true"); } for (StringRef Attr : CodeGenOpts.DefaultFunctionAttrs) { Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -567,7 +567,8 @@ // floating point values to 0. (This corresponds to its "__CUDA_FTZ" // property.) getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", - CodeGenOpts.FlushDenorm ? 1 : 0); + CodeGenOpts.FP32DenormalMode != + llvm::DenormalMode::IEEE); } // Emit OpenCL specific module metadata: OpenCL/SPIR version. Index: clang/lib/Driver/ToolChains/AMDGPU.h =================================================================== --- clang/lib/Driver/ToolChains/AMDGPU.h +++ clang/lib/Driver/ToolChains/AMDGPU.h @@ -66,6 +66,11 @@ void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, Action::OffloadKind DeviceOffloadKind) const override; + + llvm::DenormalMode getDefaultDenormalModeForType( + const llvm::opt::ArgList &DriverArgs, + Action::OffloadKind DeviceOffloadKind, + const llvm::fltSemantics *FPType = nullptr) const override; }; } // end namespace toolchains Index: clang/lib/Driver/ToolChains/AMDGPU.cpp =================================================================== --- clang/lib/Driver/ToolChains/AMDGPU.cpp +++ clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -12,6 +12,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/DriverDiagnostic.h" #include "llvm/Option/ArgList.h" +#include "llvm/Support/TargetParser.h" using namespace clang::driver; using namespace clang::driver::tools; @@ -102,6 +103,40 @@ return DAL; } +llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType( + const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind, + const llvm::fltSemantics *FPType) const { + // Denormals should always be enabled for f16 and f64. + if (!FPType || FPType != &llvm::APFloat::IEEEsingle()) + return llvm::DenormalMode::IEEE; + + if (DeviceOffloadKind == Action::OFK_Cuda) { + if (FPType && FPType == &llvm::APFloat::IEEEsingle() && + DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, + options::OPT_fno_cuda_flush_denormals_to_zero, + false)) + return llvm::DenormalMode::PreserveSign; + } + + const StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_mcpu_EQ); + auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch); + + // Default to enabling f32 denormals by default on subtargets where fma is + // fast with denormals + + const unsigned ArchAttr = llvm::AMDGPU::getArchAttrAMDGCN(Kind); + const bool DefaultDenormsAreZeroForTarget = + (ArchAttr & llvm::AMDGPU::FEATURE_FAST_FMA_F32) && + (ArchAttr & llvm::AMDGPU::FEATURE_FAST_DENORMAL_F32); + + // TODO: There are way too many flags that change this. Do we need to check + // them all? + bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) || + !DefaultDenormsAreZeroForTarget; + // Outputs are flushed to zero, preserving sign + return DAZ ? llvm::DenormalMode::PreserveSign : llvm::DenormalMode::IEEE; +} + void AMDGPUToolChain::addClangTargetOptions( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -2426,7 +2426,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, bool OFastEnabled, const ArgList &Args, - ArgStringList &CmdArgs) { + ArgStringList &CmdArgs, + Action::OffloadKind DeviceOffloadKind) { // Handle various floating point optimization flags, mapping them to the // appropriate LLVM code generation flags. This is complicated by several // "umbrella" flags, so we do this by stepping through the flags incrementally @@ -2439,8 +2440,7 @@ bool AssociativeMath = false; bool ReciprocalMath = false; bool SignedZeros = true; - bool TrappingMath = false; // Implemented via -ffp-exception-behavior - bool TrappingMathPresent = false; // Is trapping-math in args, and not + bool TrappingMath = true; // overriden by ffp-exception-behavior? bool RoundingFPMath = false; bool RoundingMathPresent = false; // Is rounding-math in args? @@ -2448,10 +2448,15 @@ StringRef FPModel = ""; // -ffp-exception-behavior options: strict, maytrap, ignore StringRef FPExceptionBehavior = ""; - StringRef DenormalFPMath = ""; + const llvm::DenormalMode DefaultDenormalFPMath = + TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind); + llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath; StringRef FPContract = ""; bool StrictFPModel = false; + llvm::DenormalMode DenormalFP32Math = TC.getDefaultDenormalModeForType( + Args, DeviceOffloadKind, &llvm::APFloat::IEEEsingle()); + if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) { CmdArgs.push_back("-mlimit-float-precision"); CmdArgs.push_back(A->getValue()); @@ -2567,7 +2572,19 @@ break; case options::OPT_fdenormal_fp_math_EQ: - DenormalFPMath = A->getValue(); + DenormalFPMath = llvm::parseDenormalFPAttribute(A->getValue()); + if (DenormalFPMath == llvm::DenormalMode::Invalid) { + D.Diag(diag::err_drv_invalid_value) + << A->getAsString(Args) << A->getValue(); + } + break; + + case options::OPT_fdenormal_fp_math_f32_EQ: + DenormalFP32Math = llvm::parseDenormalFPAttribute(A->getValue()); + if (DenormalFP32Math == llvm::DenormalMode::Invalid) { + D.Diag(diag::err_drv_invalid_value) + << A->getAsString(Args) << A->getValue(); + } break; // Validate and pass through -ffp-contract option. @@ -2637,7 +2654,7 @@ TrappingMath = true; FPExceptionBehavior = "strict"; // -fno_unsafe_math_optimizations restores default denormal handling - DenormalFPMath = ""; + DenormalFPMath = DefaultDenormalFPMath; break; case options::OPT_Ofast: @@ -2670,7 +2687,7 @@ TrappingMath = false; RoundingFPMath = false; // -fno_fast_math restores default denormal and fpcontract handling - DenormalFPMath = ""; + DenormalFPMath = DefaultDenormalFPMath; FPContract = ""; break; } @@ -2726,9 +2743,16 @@ } else if (TrappingMathPresent) CmdArgs.push_back("-fno-trapping-math"); - if (!DenormalFPMath.empty()) - CmdArgs.push_back( - Args.MakeArgString("-fdenormal-fp-math=" + DenormalFPMath)); + // TODO: Omit flag for the default IEEE instead + if (DenormalFPMath != llvm::DenormalMode::Invalid) { + CmdArgs.push_back(Args.MakeArgString( + "-fdenormal-fp-math=" + llvm::subnormalModeName(DenormalFPMath))); + } + + if (DenormalFP32Math != llvm::DenormalMode::Invalid) { + CmdArgs.push_back(Args.MakeArgString( + "-fdenormal-fp-math-f32=" + llvm::subnormalModeName(DenormalFP32Math))); + } if (!FPContract.empty()) CmdArgs.push_back(Args.MakeArgString("-ffp-contract=" + FPContract)); @@ -2948,6 +2972,8 @@ } static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs) { + // cl-denorms-are-zero is not forwarded. It is translated into a generic flag + // for denormal flushing handling based on the target. const unsigned ForwardedArguments[] = { options::OPT_cl_opt_disable, options::OPT_cl_strict_aliasing, @@ -2958,7 +2984,6 @@ options::OPT_cl_fast_relaxed_math, options::OPT_cl_mad_enable, options::OPT_cl_no_signed_zeros, - options::OPT_cl_denorms_are_zero, options::OPT_cl_fp32_correctly_rounded_divide_sqrt, options::OPT_cl_uniform_work_group_size }; @@ -4141,7 +4166,7 @@ CmdArgs.push_back("-mdisable-tail-calls"); RenderFloatingPointOptions(TC, D, isOptimizationLevelFast(Args), Args, - CmdArgs); + CmdArgs, JA.getOffloadingDeviceKind()); // Render ABI arguments switch (TC.getArch()) { @@ -4441,7 +4466,8 @@ if (Args.hasArg(options::OPT_fsplit_stack)) CmdArgs.push_back("-split-stacks"); - RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs); + RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs, + JA.getOffloadingDeviceKind()); if (Arg *A = Args.getLastArg(options::OPT_LongDouble_Group)) { if (TC.getTriple().isX86()) Index: clang/lib/Driver/ToolChains/Cuda.h =================================================================== --- clang/lib/Driver/ToolChains/Cuda.h +++ clang/lib/Driver/ToolChains/Cuda.h @@ -149,6 +149,11 @@ llvm::opt::ArgStringList &CC1Args, Action::OffloadKind DeviceOffloadKind) const override; + llvm::DenormalMode getDefaultDenormalModeForType( + const llvm::opt::ArgList &DriverArgs, + Action::OffloadKind DeviceOffloadKind, + const llvm::fltSemantics *FPType = nullptr) const override; + // Never try to use the integrated assembler with CUDA; always fork out to // ptxas. bool useIntegratedAs() const override { return false; } Index: clang/lib/Driver/ToolChains/Cuda.cpp =================================================================== --- clang/lib/Driver/ToolChains/Cuda.cpp +++ clang/lib/Driver/ToolChains/Cuda.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/VirtualFileSystem.h" #include @@ -614,10 +615,6 @@ if (DeviceOffloadingKind == Action::OFK_Cuda) { CC1Args.push_back("-fcuda-is-device"); - if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, - options::OPT_fno_cuda_flush_denormals_to_zero, false)) - CC1Args.push_back("-fcuda-flush-denormals-to-zero"); - if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); @@ -718,6 +715,21 @@ } } +llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType( + const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind, + const llvm::fltSemantics *FPType) const { + if (DeviceOffloadKind == Action::OFK_Cuda) { + if (FPType && FPType == &llvm::APFloat::IEEEsingle() && + DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, + options::OPT_fno_cuda_flush_denormals_to_zero, + false)) + return llvm::DenormalMode::PreserveSign; + } + + assert(DeviceOffloadKind != Action::OFK_Host); + return llvm::DenormalMode::IEEE; +} + bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const { const Option &O = A->getOption(); return (O.matches(options::OPT_gN_Group) && Index: clang/lib/Driver/ToolChains/HIP.cpp =================================================================== --- clang/lib/Driver/ToolChains/HIP.cpp +++ clang/lib/Driver/ToolChains/HIP.cpp @@ -295,10 +295,6 @@ CC1Args.push_back(DriverArgs.MakeArgStringRef(GpuArch)); CC1Args.push_back("-fcuda-is-device"); - if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, - options::OPT_fno_cuda_flush_denormals_to_zero, false)) - CC1Args.push_back("-fcuda-flush-denormals-to-zero"); - if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); Index: clang/lib/Frontend/CompilerInvocation.cpp =================================================================== --- clang/lib/Frontend/CompilerInvocation.cpp +++ clang/lib/Frontend/CompilerInvocation.cpp @@ -910,9 +910,6 @@ Args.hasArg(OPT_cl_unsafe_math_optimizations) || Args.hasArg(OPT_cl_fast_relaxed_math)); Opts.Reassociate = Args.hasArg(OPT_mreassociate); - Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) || - (Args.hasArg(OPT_fcuda_is_device) && - Args.hasArg(OPT_fcuda_flush_denormals_to_zero)); Opts.CorrectlyRoundedDivSqrt = Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt); Opts.UniformWGSize = @@ -1277,6 +1274,13 @@ Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val; } + if (Arg *A = Args.getLastArg(OPT_fdenormal_fp_math_f32_EQ)) { + StringRef Val = A->getValue(); + Opts.FP32DenormalMode = llvm::parseDenormalFPAttribute(Val); + if (Opts.FP32DenormalMode == llvm::DenormalMode::Invalid) + Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val; + } + if (Arg *A = Args.getLastArg(OPT_fpcc_struct_return, OPT_freg_struct_return)) { if (A->getOption().matches(OPT_fpcc_struct_return)) { Opts.setStructReturnConvention(CodeGenOptions::SRCK_OnStack); Index: clang/test/CodeGenCUDA/flush-denormals.cu =================================================================== --- clang/test/CodeGenCUDA/flush-denormals.cu +++ clang/test/CodeGenCUDA/flush-denormals.cu @@ -1,23 +1,34 @@ // RUN: %clang_cc1 -fcuda-is-device \ // RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ -// RUN: FileCheck %s -check-prefix CHECK -check-prefix NOFTZ -// RUN: %clang_cc1 -fcuda-is-device -fcuda-flush-denormals-to-zero \ +// RUN: FileCheck -check-prefix=DEFAULT %s + +// RUN: %clang_cc1 -fcuda-is-device -fdenormal-fp-math-f32=ieee \ +// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ +// RUN: FileCheck -check-prefix=NOFTZ %s + +// RUN: %clang_cc1 -fcuda-is-device -fdenormal-fp-math-f32=preserve-sign \ // RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ -// RUN: FileCheck %s -check-prefix CHECK -check-prefix FTZ +// RUN: FileCheck -check-prefix=FTZ %s +// FIXME: Unspecified should default to ieee // RUN: %clang_cc1 -fcuda-is-device -x hip \ // RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ -// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ -// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \ +// RUN: FileCheck -check-prefix=AMDFTZ %s + +// RUN: %clang_cc1 -fcuda-is-device -x hip \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -fdenormal-fp-math-f32=ieee -emit-llvm -o - %s | \ +// RUN: FileCheck -check-prefix=AMDNOFTZ %s + +// RUN: %clang_cc1 -fcuda-is-device -x hip -fdenormal-fp-math-f32=preserve-sign \ // RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ -// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ +// RUN: FileCheck -check-prefix=AMDFTZ %s #include "Inputs/cuda.h" -// Checks that device function calls get emitted with the "ntpvx-f32ftz" -// attribute set to "true" when we compile CUDA device code with -// -fcuda-flush-denormals-to-zero. Further, check that we reflect the presence -// or absence of -fcuda-flush-denormals-to-zero in a module flag. +// Checks that device function calls get emitted with the "denormal-fp-math-f32" +// attribute set when we compile CUDA device code with +// -fdenormal-fp-math-f32. Further, check that we reflect the presence or +// absence of -fcuda-flush-denormals-to-zero in a module flag. // AMDGCN targets always have +fp64-fp16-denormals. // AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals. @@ -28,8 +39,13 @@ // CHECK-LABEL: define void @foo() #0 extern "C" __device__ void foo() {} -// FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true" -// NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz" +// FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign" +// NOFTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="ieee" + + +// FIXME: This should be removed +// DEFAULT-NOT: "denormal-fp-math-f32" + // AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals // AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals Index: clang/test/CodeGenCUDA/propagate-metadata.cu =================================================================== --- clang/test/CodeGenCUDA/propagate-metadata.cu +++ clang/test/CodeGenCUDA/propagate-metadata.cu @@ -15,17 +15,17 @@ // RUN: %s -o %t.bc -triple nvptx-unknown-unknown // RUN: %clang_cc1 -x cuda %s -emit-llvm -mlink-builtin-bitcode %t.bc -o - \ -// RUN: -fno-trapping-math -fcuda-is-device -triple nvptx-unknown-unknown \ +// RUN: -fno-trapping-math -fcuda-is-device -fdenormal-fp-math-f32=ieee -triple nvptx-unknown-unknown \ // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ --check-prefix=NOFAST // RUN: %clang_cc1 -x cuda %s -emit-llvm -mlink-builtin-bitcode %t.bc \ -// RUN: -fno-trapping-math -fcuda-flush-denormals-to-zero -o - \ +// RUN: -fno-trapping-math -fdenormal-fp-math-f32=preserve-sign -o - \ // RUN: -fcuda-is-device -triple nvptx-unknown-unknown \ // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ \ // RUN: --check-prefix=NOFAST // RUN: %clang_cc1 -x cuda %s -emit-llvm -mlink-builtin-bitcode %t.bc \ -// RUN: -fno-trapping-math -fcuda-flush-denormals-to-zero -o - \ +// RUN: -fno-trapping-math -fdenormal-fp-math-f32=preserve-sign -o - \ // RUN: -fcuda-is-device -menable-unsafe-fp-math -triple nvptx-unknown-unknown \ // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=FAST @@ -51,13 +51,21 @@ // CHECK: define void @kernel() [[attr:#[0-9]+]] // CHECK: define internal void @lib_fn() [[attr]] +// FIXME: These -NOT checks do not work as intended and do not check on the same +// line. + // Check the attribute list. // CHECK: attributes [[attr]] = { + // CHECK-SAME: convergent // CHECK-SAME: "no-trapping-math"="true" -// FTZ-SAME: "nvptx-f32ftz"="true" -// NOFTZ-NOT: "nvptx-f32ftz"="true" +// FTZ-NOT: "denormal-fp-math" + +// FTZ-SAME: "denormal-fp-math-f32"="preserve-sign" +// NOFTZ-SAME: "denormal-fp-math-f32"="ieee" + +// CHECK-SAME: "no-trapping-math"="true" // FAST-SAME: "unsafe-fp-math"="true" // NOFAST-NOT: "unsafe-fp-math"="true" Index: clang/test/CodeGenOpenCL/amdgpu-features.cl =================================================================== --- clang/test/CodeGenOpenCL/amdgpu-features.cl +++ clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -14,13 +14,13 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX600 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX601 %s -// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX801: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+s-memrealtime" +// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals" +// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals" +// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals" +// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals" +// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals" +// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals" +// GFX801: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+s-memrealtime,-fp32-denormals" // GFX700: "target-features"="+ci-insts,+flat-address-space,+fp64-fp16-denormals,-fp32-denormals" // GFX600: "target-features"="+fp64-fp16-denormals,-fp32-denormals" // GFX601: "target-features"="+fp64-fp16-denormals,-fp32-denormals" Index: clang/test/CodeGenOpenCL/denorms-are-zero.cl =================================================================== --- clang/test/CodeGenOpenCL/denorms-are-zero.cl +++ /dev/null @@ -1,45 +0,0 @@ -// RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - %s | FileCheck -check-prefix=DENORM-ZERO %s - -// Slow FMAF and slow f32 denormals -// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu pitcairn %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s -// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu pitcairn %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s - -// Fast FMAF, but slow f32 denormals -// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu tahiti %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s -// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu tahiti %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s - -// Fast F32 denormals, but slow FMAF -// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s -// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s - -// Fast F32 denormals and fast FMAF -// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu gfx900 %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-DENORM %s -// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu gfx900 %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s - -// RUN: %clang_cc1 -emit-llvm -target-feature +fp32-denormals -target-feature -fp64-fp16-denormals -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FEATURE %s -// RUN: %clang_cc1 -emit-llvm -target-feature +fp32-denormals -target-feature -fp64-fp16-denormals -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu pitcairn %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FEATURE %s - - - -// For all targets 'denorms-are-zero' attribute is set to 'true' -// if '-cl-denorms-are-zero' was specified and to 'false' otherwise. - -// CHECK-LABEL: define {{(dso_local )?}}void @f() -// CHECK: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="false" -// -// DENORM-ZERO-LABEL: define {{(dso_local )?}}void @f() -// DENORM-ZERO: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="true" - -// For amdgcn target cpu fiji, fp32 should be flushed since fiji does not support fp32 denormals, unless +fp32-denormals is -// explicitly set. amdgcn target always do not flush fp64 denormals. The control for fp64 and fp16 denormals is the same. - -// AMDGCN-LABEL: define void @f() - -// AMDGCN-FLUSH: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="false" {{.*}} "target-features"="{{[^"]*}}+fp64-fp16-denormals,{{[^"]*}}-fp32-denormals{{[^"]*}}" -// AMDGCN-FLUSH-OPT: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="true" {{.*}} "target-features"="{{[^"]*}}+fp64-fp16-denormals,{{[^"]*}}-fp32-denormals{{[^"]*}}" - -// AMDGCN-DENORM: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="false" {{.*}} "target-features"="{{[^"]*}}+fp32-denormals,{{[^"]*}}+fp64-fp16-denormals{{[^"]*}}" - -// AMDGCN-FEATURE: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="true" {{.*}} "target-features"="{{[^"]*}}+fp32-denormals,{{[^"]*}}-fp64-fp16-denormals{{[^"]*}}" -void f() {} Index: clang/test/CodeGenOpenCL/gfx9-fp32-denorms.cl =================================================================== --- clang/test/CodeGenOpenCL/gfx9-fp32-denorms.cl +++ /dev/null @@ -1,13 +0,0 @@ -// REQUIRES: amdgpu-registered-target - -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - %s | FileCheck --check-prefix=DEFAULT %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - -target-feature +fp32-denormals %s | FileCheck --check-prefix=FEATURE_FP32_DENORMALS_ON %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - -target-feature -fp32-denormals %s | FileCheck --check-prefix=FEATURE_FP32_DENORMALS_OFF %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - -cl-denorms-are-zero %s | FileCheck --check-prefix=OPT_DENORMS_ARE_ZERO %s - -// DEFAULT: +fp32-denormals -// FEATURE_FP32_DENORMALS_ON: +fp32-denormals -// FEATURE_FP32_DENORMALS_OFF: -fp32-denormals -// OPT_DENORMS_ARE_ZERO: -fp32-denormals - -kernel void gfx9_fp32_denorms() {} Index: clang/test/Driver/cl-denorms-are-zero.cl =================================================================== --- /dev/null +++ clang/test/Driver/cl-denorms-are-zero.cl @@ -0,0 +1,20 @@ +// Slow FMAF and slow f32 denormals +// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=pitcairn %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s +// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=pitcairn %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s + +// Fast FMAF, but slow f32 denormals +// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s +// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s + +// Fast F32 denormals, but slow FMAF +// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=fiji %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s +// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=fiji %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s + +// Fast F32 denormals and fast FMAF +// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-DENORM %s +// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s + +// AMDGCN-FLUSH: "-fdenormal-fp-math-f32=preserve-sign" + +// This should be omitted and default to ieee +// AMDGCN-DENORM-NOT: "-fdenormal-fp-math-f32" Index: clang/test/Driver/cuda-flush-denormals-to-zero.cu =================================================================== --- /dev/null +++ clang/test/Driver/cuda-flush-denormals-to-zero.cu @@ -0,0 +1,13 @@ +// Checks that cuda compilation does the right thing when passed +// -fcuda-flush-denormals-to-zero. This should be translated to +// -fdenormal-fp-math-f32=preserve-sign + +// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_20 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s +// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_20 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s +// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_10 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s +// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_10 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s + +// CPUFTZ-NOT: -fdenormal-fp-math + +// FTZ: "-fdenormal-fp-math-f32=preserve-sign" +// NOFTZ: "-fdenormal-fp-math=ieee" Index: clang/test/Driver/denormal-fp-math.c =================================================================== --- clang/test/Driver/denormal-fp-math.c +++ clang/test/Driver/denormal-fp-math.c @@ -5,7 +5,7 @@ // RUN: %clang -### -target arm-unknown-linux-gnu -c %s -fdenormal-fp-math=ieee -fno-unsafe-math-optimizations -v 2>&1 | FileCheck -check-prefix=CHECK-NO-UNSAFE %s // RUN: not %clang -target arm-unknown-linux-gnu -c %s -fdenormal-fp-math=foo -v 2>&1 | FileCheck -check-prefix=CHECK-INVALID %s -// CHECK-IEEE: "-fdenormal-fp-math=ieee" +// CHECK-IEEE: -fdenormal-fp-math=ieee // CHECK-PS: "-fdenormal-fp-math=preserve-sign" // CHECK-PZ: "-fdenormal-fp-math=positive-zero" // CHECK-NO-UNSAFE-NOT: "-fdenormal-fp-math=ieee" Index: clang/test/Driver/opencl.cl =================================================================== --- clang/test/Driver/opencl.cl +++ clang/test/Driver/opencl.cl @@ -32,7 +32,10 @@ // CHECK-FAST-RELAXED-MATH: "-cc1" {{.*}} "-cl-fast-relaxed-math" // CHECK-MAD-ENABLE: "-cc1" {{.*}} "-cl-mad-enable" // CHECK-NO-SIGNED-ZEROS: "-cc1" {{.*}} "-cl-no-signed-zeros" -// CHECK-DENORMS-ARE-ZERO: "-cc1" {{.*}} "-cl-denorms-are-zero" + +// This is not forwarded +// CHECK-DENORMS-ARE-ZERO-NOT: "-cl-denorms-are-zero" + // CHECK-ROUND-DIV: "-cc1" {{.*}} "-cl-fp32-correctly-rounded-divide-sqrt" // CHECK-UNIFORM-WG: "-cc1" {{.*}} "-cl-uniform-work-group-size" // CHECK-C99: error: invalid value 'c99' in '-cl-std=c99' Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -1818,6 +1818,30 @@ mode or that might alter the state of floating-point status flags that might otherwise be set or cleared by calling this function. LLVM will not introduce any new floating-point instructions that may trap. + +``"denormal-fp-math"`` + This indicates the denormal (subnormal) handling that may be assumed + for the default floating-point environment. This may be one of + ``"ieee"``, ``"preserve-sign"``, or ``"positive-zero"``. If this + is attribute is not specified, the default is ``"ieee"``. If the + mode is ``"preserve-sign"``, or ``"positive-zero"``, denormal + outputs may be flushed to zero by standard floating point + operations. It is not mandated that flushing to zero occurs, but if + a denormal output is flushed to zero, it must respect the sign + mode. Not all targets support all modes. While this indicates the + expected floating point mode the function will be executed with, + this does not make any attempt to ensure the mode is + consistent. User or platform code is expected to set the floating + point mode appropriately before function entry. + +``"denormal-fp-math-f32"`` + Same as ``"denormal-fp-math"``, but only controls the behavior of + the 32-bit float type (or vectors of 32-bit floats). If both are + are present, this overrides ``"denormal-fp-math"``. Not all targets + support separately setting the denormal mode per type, and no + attempt is made to diagnose unsupported uses. Currently this + attribute is respected by the AMDGPU and NVPTX backends. + ``"thunk"`` This attribute indicates that the function will delegate to some other function with a tail call. The prototype of a thunk should not be used for Index: llvm/lib/CodeGen/MachineFunction.cpp =================================================================== --- llvm/lib/CodeGen/MachineFunction.cpp +++ llvm/lib/CodeGen/MachineFunction.cpp @@ -271,6 +271,16 @@ } DenormalMode MachineFunction::getDenormalMode(const fltSemantics &FPType) const { + if (&FPType == &APFloat::IEEEsingle()) { + Attribute Attr = F.getFnAttribute("denormal-fp-math-f32"); + StringRef Val = Attr.getValueAsString(); + if (!Val.empty()) + return parseDenormalFPAttribute(Val); + + // If the f32 variant of the attribute isn't specified, try to use the + // generic one. + } + // TODO: Should probably avoid the connection to the IR and store directly // in the MachineFunction. Attribute Attr = F.getFnAttribute("denormal-fp-math"); Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -121,14 +121,10 @@ if (FtzEnabled.getNumOccurrences() > 0) { // If nvptx-f32ftz is used on the command-line, always honor it return FtzEnabled; - } else { - const Function &F = MF.getFunction(); - // Otherwise, check for an nvptx-f32ftz attribute on the function - if (F.hasFnAttribute("nvptx-f32ftz")) - return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true"; - else - return false; } + + return MF.getDenormalMode(APFloat::IEEEsingle()) == + DenormalMode::PreserveSign; } static bool IsPTXVectorType(MVT VT) { Index: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" @@ -1709,9 +1710,10 @@ // intrinsic, we don't have to look up any module metadata, as // FtzRequirementTy will be FTZ_Any.) if (Action.FtzRequirement != FTZ_Any) { - bool FtzEnabled = - II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() == - "true"; + StringRef Attr = II->getFunction() + ->getFnAttribute("denormal-fp-math-f32") + .getValueAsString(); + bool FtzEnabled = parseDenormalFPAttribute(Attr) != DenormalMode::IEEE; if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) return nullptr; Index: llvm/test/CodeGen/NVPTX/fast-math.ll =================================================================== --- llvm/test/CodeGen/NVPTX/fast-math.ll +++ llvm/test/CodeGen/NVPTX/fast-math.ll @@ -162,4 +162,4 @@ } attributes #0 = { "unsafe-fp-math" = "true" } -attributes #1 = { "nvptx-f32ftz" = "true" } +attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" } Index: llvm/test/CodeGen/NVPTX/math-intrins.ll =================================================================== --- llvm/test/CodeGen/NVPTX/math-intrins.ll +++ llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -289,4 +289,4 @@ } attributes #0 = { nounwind readnone } -attributes #1 = { "nvptx-f32ftz" = "true" } +attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" } Index: llvm/test/CodeGen/NVPTX/sqrt-approx.ll =================================================================== --- llvm/test/CodeGen/NVPTX/sqrt-approx.ll +++ llvm/test/CodeGen/NVPTX/sqrt-approx.ll @@ -146,5 +146,5 @@ } attributes #0 = { "unsafe-fp-math" = "true" } -attributes #1 = { "nvptx-f32ftz" = "true" } +attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" } attributes #2 = { "reciprocal-estimates" = "rsqrtf:1,rsqrtd:1,sqrtf:1,sqrtd:1" } Index: llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll =================================================================== --- llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll +++ llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll @@ -5,11 +5,11 @@ ; hackery: ; RUN: cat %s > %t.ftz -; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "true" }' >> %t.ftz +; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "preserve-sign" }' >> %t.ftz ; RUN: opt < %t.ftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ ; RUN: cat %s > %t.noftz -; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "false" }' >> %t.noftz +; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "ieee" }' >> %t.noftz ; RUN: opt < %t.noftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ ; We handle nvvm intrinsics with ftz variants as follows: