Index: clang/lib/Driver/ToolChains/AMDGPU.h
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPU.h
+++ clang/lib/Driver/ToolChains/AMDGPU.h
@@ -13,6 +13,8 @@
 #include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
+#include "llvm/Support/TargetParser.h"
+
 #include <map>
 
 namespace clang {
@@ -67,6 +69,10 @@
                         llvm::opt::ArgStringList &CC1Args,
                         Action::OffloadKind DeviceOffloadKind) const override;
 
+  /// Return whether denormals should be flushed, and treated as 0 by default
+  /// for the subtarget.
+  static bool getDefaultDenormsAreZeroForTarget(llvm::AMDGPU::GPUKind GPUKind);
+
   llvm::DenormalMode getDefaultDenormalModeForType(
       const llvm::opt::ArgList &DriverArgs,
       Action::OffloadKind DeviceOffloadKind,
Index: clang/lib/Driver/ToolChains/AMDGPU.cpp
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -103,6 +103,19 @@
   return DAL;
 }
 
+bool AMDGPUToolChain::getDefaultDenormsAreZeroForTarget(
+    llvm::AMDGPU::GPUKind Kind) {
+  const unsigned ArchAttr = llvm::AMDGPU::getArchAttrAMDGCN(Kind);
+
+  // f32 denormals stay enabled only when the subtarget reports both fast f32
+  // fma and fast f32 denormals; every other subtarget flushes by default.
+  const bool BothDenormAndFMAFast =
+      (ArchAttr & llvm::AMDGPU::FEATURE_FAST_FMA_F32) &&
+      (ArchAttr & llvm::AMDGPU::FEATURE_FAST_DENORMAL_F32);
+
+  return !BothDenormAndFMAFast;
+}
+
 llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType(
     const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
     const llvm::fltSemantics *FPType) const {
@@ -121,18 +134,10 @@
   const StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_mcpu_EQ);
   auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch);
 
-  // Default to enabling f32 denormals by default on subtargets where fma is
-  // fast with denormals
-
-  const unsigned ArchAttr = llvm::AMDGPU::getArchAttrAMDGCN(Kind);
-  const bool DefaultDenormsAreZeroForTarget =
-    (ArchAttr & llvm::AMDGPU::FEATURE_FAST_FMA_F32) &&
-    (ArchAttr & llvm::AMDGPU::FEATURE_FAST_DENORMAL_F32);
-
   // TODO: There are way too many flags that change this. Do we need to check
   // them all?
   bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) ||
-             !DefaultDenormsAreZeroForTarget;
+             getDefaultDenormsAreZeroForTarget(Kind);
   // Outputs are flushed to zero, preserving sign
   return DAZ ? llvm::DenormalMode::getPreserveSign() :
                llvm::DenormalMode::getIEEE();
Index: clang/lib/Driver/ToolChains/HIP.h
===================================================================
--- clang/lib/Driver/ToolChains/HIP.h
+++ clang/lib/Driver/ToolChains/HIP.h
@@ -115,6 +115,11 @@
 
   unsigned GetDefaultDwarfVersion() const override { return 4; }
 
+  llvm::DenormalMode getDefaultDenormalModeForType(
+      const llvm::opt::ArgList &DriverArgs,
+      Action::OffloadKind DeviceOffloadKind,
+      const llvm::fltSemantics *FPType = nullptr) const override;
+
   const ToolChain &HostTC;
 
 protected:
Index: clang/lib/Driver/ToolChains/HIP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "HIP.h"
+#include "AMDGPU.h"
 #include "CommonArgs.h"
 #include "InputInfo.h"
 #include "clang/Basic/Cuda.h"
@@ -16,6 +17,7 @@
 #include "clang/Driver/Options.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/TargetParser.h"
 
 using namespace clang::driver;
 using namespace clang::driver::toolchains;
@@ -272,6 +274,34 @@
   getProgramPaths().push_back(getDriver().Dir);
 }
 
+// FIXME: Duplicated in AMDGPUToolChain
+llvm::DenormalMode HIPToolChain::getDefaultDenormalModeForType(
+    const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
+    const llvm::fltSemantics *FPType) const {
+  // Denormals should always be enabled for f16 and f64.
+  if (FPType != &llvm::APFloat::IEEEsingle())
+    return llvm::DenormalMode::getIEEE();
+
+  // Only f32 reaches this point, so the explicit CUDA flush flag can be
+  // honored without re-checking FPType.
+  if (DeviceOffloadKind == Action::OFK_Cuda &&
+      DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
+                         options::OPT_fno_cuda_flush_denormals_to_zero,
+                         false))
+    return llvm::DenormalMode::getPreserveSign();
+
+  const StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_mcpu_EQ);
+  auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch);
+
+  // TODO: There are way too many flags that change this. Do we need to check
+  // them all?
+  bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) ||
+             AMDGPUToolChain::getDefaultDenormsAreZeroForTarget(Kind);
+  // Outputs are flushed to zero, preserving sign
+  return DAZ ? llvm::DenormalMode::getPreserveSign() :
+               llvm::DenormalMode::getIEEE();
+}
+
 void HIPToolChain::addClangTargetOptions(
     const llvm::opt::ArgList &DriverArgs,
     llvm::opt::ArgStringList &CC1Args,
Index: clang/test/Driver/cuda-flush-denormals-to-zero.cu
===================================================================
--- clang/test/Driver/cuda-flush-denormals-to-zero.cu
+++ clang/test/Driver/cuda-flush-denormals-to-zero.cu
@@ -7,6 +7,16 @@
 // RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s
 // RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
 
+// Test explicit argument.
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
+// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
+// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
+
+// Test the default changing with no argument based on the subtarget.
+// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
+// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
+
 // CPUFTZ-NOT: -fdenormal-fp-math
 
 // FTZ-NOT: -fdenormal-fp-math-f32=