Index: clang/include/clang/Driver/ToolChain.h =================================================================== --- clang/include/clang/Driver/ToolChain.h +++ clang/include/clang/Driver/ToolChain.h @@ -636,8 +636,7 @@ /// environment for the given \p FPType if given. Otherwise, the default /// assumed mode for any floating point type. virtual llvm::DenormalMode getDefaultDenormalModeForType( - const llvm::opt::ArgList &DriverArgs, - Action::OffloadKind DeviceOffloadKind, + const llvm::opt::ArgList &DriverArgs, const JobAction &JA, const llvm::fltSemantics *FPType = nullptr) const { return llvm::DenormalMode::getIEEE(); } Index: clang/lib/Driver/ToolChains/AMDGPU.h =================================================================== --- clang/lib/Driver/ToolChains/AMDGPU.h +++ clang/lib/Driver/ToolChains/AMDGPU.h @@ -214,8 +214,7 @@ static bool getDefaultDenormsAreZeroForTarget(llvm::AMDGPU::GPUKind GPUKind); llvm::DenormalMode getDefaultDenormalModeForType( - const llvm::opt::ArgList &DriverArgs, - Action::OffloadKind DeviceOffloadKind, + const llvm::opt::ArgList &DriverArgs, const JobAction &JA, const llvm::fltSemantics *FPType = nullptr) const override; }; Index: clang/lib/Driver/ToolChains/AMDGPU.cpp =================================================================== --- clang/lib/Driver/ToolChains/AMDGPU.cpp +++ clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -273,18 +273,22 @@ } llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType( - const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind, + const llvm::opt::ArgList &DriverArgs, const JobAction &JA, const llvm::fltSemantics *FPType) const { // Denormals should always be enabled for f16 and f64. 
if (!FPType || FPType != &llvm::APFloat::IEEEsingle()) return llvm::DenormalMode::getIEEE(); - if (DeviceOffloadKind == Action::OFK_Cuda) { + if (JA.getOffloadingDeviceKind() == Action::OFK_HIP || + JA.getOffloadingDeviceKind() == Action::OFK_Cuda) { + auto Kind = llvm::AMDGPU::parseArchAMDGCN(JA.getOffloadingArch()); if (FPType && FPType == &llvm::APFloat::IEEEsingle() && DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, options::OPT_fno_cuda_flush_denormals_to_zero, - false)) + getDefaultDenormsAreZeroForTarget(Kind))) return llvm::DenormalMode::getPreserveSign(); + + return llvm::DenormalMode::getIEEE(); } const StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_mcpu_EQ); @@ -294,7 +298,9 @@ // them all? bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) || getDefaultDenormsAreZeroForTarget(Kind); - // Outputs are flushed to zero, preserving sign + + // Outputs are flushed to zero (FTZ), preserving sign. Denormal inputs are + // also implicitly treated as zero (DAZ). return DAZ ? llvm::DenormalMode::getPreserveSign() : llvm::DenormalMode::getIEEE(); } Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -2510,7 +2510,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, bool OFastEnabled, const ArgList &Args, ArgStringList &CmdArgs, - Action::OffloadKind DeviceOffloadKind) { + const JobAction &JA) { // Handle various floating point optimization flags, mapping them to the // appropriate LLVM code generation flags. 
This is complicated by several // "umbrella" flags, so we do this by stepping through the flags incrementally @@ -2533,10 +2533,9 @@ // -ffp-exception-behavior options: strict, maytrap, ignore StringRef FPExceptionBehavior = ""; const llvm::DenormalMode DefaultDenormalFPMath = - TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind); + TC.getDefaultDenormalModeForType(Args, JA); const llvm::DenormalMode DefaultDenormalFP32Math = - TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind, - &llvm::APFloat::IEEEsingle()); + TC.getDefaultDenormalModeForType(Args, JA, &llvm::APFloat::IEEEsingle()); llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath; llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math; @@ -4295,7 +4294,7 @@ CmdArgs.push_back("-mdisable-tail-calls"); RenderFloatingPointOptions(TC, D, isOptimizationLevelFast(Args), Args, - CmdArgs, JA.getOffloadingDeviceKind()); + CmdArgs, JA); // Render ABI arguments switch (TC.getArch()) { @@ -4618,8 +4617,7 @@ if (Args.hasArg(options::OPT_fsplit_stack)) CmdArgs.push_back("-split-stacks"); - RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs, - JA.getOffloadingDeviceKind()); + RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs, JA); if (Arg *A = Args.getLastArg(options::OPT_mdouble_EQ)) { if (TC.getArch() == llvm::Triple::avr) Index: clang/lib/Driver/ToolChains/Cuda.h =================================================================== --- clang/lib/Driver/ToolChains/Cuda.h +++ clang/lib/Driver/ToolChains/Cuda.h @@ -156,8 +156,7 @@ Action::OffloadKind DeviceOffloadKind) const override; llvm::DenormalMode getDefaultDenormalModeForType( - const llvm::opt::ArgList &DriverArgs, - Action::OffloadKind DeviceOffloadKind, + const llvm::opt::ArgList &DriverArgs, const JobAction &JA, const llvm::fltSemantics *FPType = nullptr) const override; // Never try to use the integrated assembler with CUDA; always fork out to Index: clang/lib/Driver/ToolChains/Cuda.cpp 
=================================================================== --- clang/lib/Driver/ToolChains/Cuda.cpp +++ clang/lib/Driver/ToolChains/Cuda.cpp @@ -721,9 +721,9 @@ } llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType( - const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind, + const llvm::opt::ArgList &DriverArgs, const JobAction &JA, const llvm::fltSemantics *FPType) const { - if (DeviceOffloadKind == Action::OFK_Cuda) { + if (JA.getOffloadingDeviceKind() == Action::OFK_Cuda) { if (FPType && FPType == &llvm::APFloat::IEEEsingle() && DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, options::OPT_fno_cuda_flush_denormals_to_zero, @@ -731,7 +731,7 @@ return llvm::DenormalMode::getPreserveSign(); } - assert(DeviceOffloadKind != Action::OFK_Host); + assert(JA.getOffloadingDeviceKind() != Action::OFK_Host); return llvm::DenormalMode::getIEEE(); } Index: clang/lib/Driver/ToolChains/Linux.h =================================================================== --- clang/lib/Driver/ToolChains/Linux.h +++ clang/lib/Driver/ToolChains/Linux.h @@ -49,9 +49,8 @@ std::vector ExtraOpts; llvm::DenormalMode getDefaultDenormalModeForType( - const llvm::opt::ArgList &DriverArgs, - Action::OffloadKind DeviceOffloadKind, - const llvm::fltSemantics *FPType = nullptr) const override; + const llvm::opt::ArgList &DriverArgs, const JobAction &JA, + const llvm::fltSemantics *FPType = nullptr) const override; protected: Tool *buildAssembler() const override; Index: clang/lib/Driver/ToolChains/Linux.cpp =================================================================== --- clang/lib/Driver/ToolChains/Linux.cpp +++ clang/lib/Driver/ToolChains/Linux.cpp @@ -988,10 +988,10 @@ ToolChain::addProfileRTLibs(Args, CmdArgs); } -llvm::DenormalMode Linux::getDefaultDenormalModeForType( - const llvm::opt::ArgList &DriverArgs, - Action::OffloadKind DeviceOffloadKind, - const llvm::fltSemantics *FPType) const { +llvm::DenormalMode 
+Linux::getDefaultDenormalModeForType(const llvm::opt::ArgList &DriverArgs, + const JobAction &JA, + const llvm::fltSemantics *FPType) const { switch (getTriple().getArch()) { case llvm::Triple::x86: case llvm::Triple::x86_64: { Index: clang/lib/Driver/ToolChains/PS4CPU.h =================================================================== --- clang/lib/Driver/ToolChains/PS4CPU.h +++ clang/lib/Driver/ToolChains/PS4CPU.h @@ -94,9 +94,8 @@ Action::OffloadKind DeviceOffloadingKind) const override; llvm::DenormalMode getDefaultDenormalModeForType( - const llvm::opt::ArgList &DriverArgs, - Action::OffloadKind DeviceOffloadKind, - const llvm::fltSemantics *FPType) const override { + const llvm::opt::ArgList &DriverArgs, const JobAction &JA, + const llvm::fltSemantics *FPType) const override { // DAZ and FTZ are on by default. return llvm::DenormalMode::getPreserveSign(); } Index: clang/test/Driver/cuda-flush-denormals-to-zero.cu =================================================================== --- clang/test/Driver/cuda-flush-denormals-to-zero.cu +++ clang/test/Driver/cuda-flush-denormals-to-zero.cu @@ -7,16 +7,28 @@ // RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s // RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s -// Test explicit argument. 
+// Test explicit argument, with CUDA offload kind // RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s // RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s + +// Test explicit argument, with HIP offload kind +// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s +// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s + // RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s // RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s -// Test the default changing with no argument based on the subtarget. +// Test the default changing with no argument based on the subtarget in HIP mode // RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s // RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s + +// Test multiple offload archs with different defaults. 
+// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=MIXED-DEFAULT-MODE %s +// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell -fcuda-flush-denormals-to-zero --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZX2 %s +// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell -fno-cuda-flush-denormals-to-zero --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s + + // CPUFTZ-NOT: -fdenormal-fp-math // FTZ-NOT: -fdenormal-fp-math-f32= @@ -25,3 +37,13 @@ // The default of ieee is omitted // NOFTZ-NOT: "-fdenormal-fp-math" // NOFTZ-NOT: "-fdenormal-fp-math-f32" + +// MIXED-DEFAULT-MODE-NOT: -denormal-fp-math +// MIXED-DEFAULT-MODE: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign" +// MIXED-DEFAULT-MODE-SAME: "-target-cpu" "gfx803" +// MIXED-DEFAULT-MODE-NOT: -denormal-fp-math + +// FTZX2: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign" +// FTZX2-SAME: "-target-cpu" "gfx803" +// FTZX2: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign" +// FTZX2-SAME: "-target-cpu" "gfx900"