Index: clang/include/clang/Basic/CodeGenOptions.h
===================================================================
--- clang/include/clang/Basic/CodeGenOptions.h
+++ clang/include/clang/Basic/CodeGenOptions.h
@@ -166,6 +166,9 @@
   /// The floating-point denormal mode to use.
   llvm::DenormalMode FPDenormalMode = llvm::DenormalMode::Invalid;
 
+  /// The floating-point subnormal mode to use, for float.
+  llvm::DenormalMode FP32DenormalMode = llvm::DenormalMode::Invalid;
+
   /// The float precision limit to use, if non-empty.
   std::string LimitFloatPrecision;
 
Index: clang/include/clang/Basic/CodeGenOptions.def
===================================================================
--- clang/include/clang/Basic/CodeGenOptions.def
+++ clang/include/clang/Basic/CodeGenOptions.def
@@ -149,7 +149,6 @@
 CODEGENOPT(ReciprocalMath    , 1, 0) ///< Allow FP divisions to be reassociated.
 CODEGENOPT(NoTrappingMath    , 1, 0) ///< Set when -fno-trapping-math is enabled.
 CODEGENOPT(NoNaNsFPMath      , 1, 0) ///< Assume FP arguments, results not NaN.
-CODEGENOPT(FlushDenorm       , 1, 0) ///< Allow FP denorm numbers to be flushed to zero
 CODEGENOPT(CorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt
 
 /// When false, this attempts to generate code as if the result of an
Index: clang/include/clang/Driver/CC1Options.td
===================================================================
--- clang/include/clang/Driver/CC1Options.td
+++ clang/include/clang/Driver/CC1Options.td
@@ -405,6 +405,9 @@
 def cfguard : Flag<["-"], "cfguard">,
     HelpText<"Emit Windows Control Flow Guard tables and checks">;
 
+def fdenormal_fp_math_f32_EQ : Joined<["-"], "fdenormal-fp-math-f32=">,
+   Group<f_Group>;
+
 //===----------------------------------------------------------------------===//
 // Dependency Output Options
 //===----------------------------------------------------------------------===//
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -523,7 +523,7 @@
   HelpText<"OpenCL only. Allow use of less precise no signed zeros computations in the generated binary.">;
 def cl_std_EQ : Joined<["-"], "cl-std=">, Group<opencl_Group>, Flags<[CC1Option]>,
   HelpText<"OpenCL language standard to compile for.">, Values<"cl,CL,cl1.1,CL1.1,cl1.2,CL1.2,cl2.0,CL2.0,clc++,CLC++">;
-def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group>, Flags<[CC1Option]>,
+def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group>,
   HelpText<"OpenCL only. Allow denormals to be flushed to zero.">;
 def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
   HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">;
@@ -578,7 +578,7 @@
 def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Group<i_Group>,
   HelpText<"Path to ptxas (used for compiling CUDA code)">;
 def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
-  Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
+  HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
 def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
 def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
   Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
Index: clang/include/clang/Driver/ToolChain.h
===================================================================
--- clang/include/clang/Driver/ToolChain.h
+++ clang/include/clang/Driver/ToolChain.h
@@ -16,7 +16,9 @@
 #include "clang/Driver/Action.h"
 #include "clang/Driver/Multilib.h"
 #include "clang/Driver/Types.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
@@ -606,6 +608,17 @@
   /// Returns true when it's possible to split LTO unit to use whole
   /// program devirtualization and CFI santiizers.
   virtual bool canSplitThinLTOUnit() const { return true; }
+
+  /// Returns the output denormal handling type in the default floating point
+  /// environment for the given \p FPType if given. Otherwise, the default
+  /// assumed mode for any floating point type.
+  virtual llvm::DenormalMode getDefaultDenormalModeForType(
+      const llvm::opt::ArgList &DriverArgs,
+      Action::OffloadKind DeviceOffloadKind,
+      const llvm::fltSemantics *FPType = nullptr) const {
+    // FIXME: This should be IEEE when default handling is fixed.
+    return llvm::DenormalMode::Invalid;
+  }
 };
 
 /// Set a ToolChain's effective triple. Reset it when the registration object
Index: clang/lib/Basic/Targets/AMDGPU.cpp
===================================================================
--- clang/lib/Basic/Targets/AMDGPU.cpp
+++ clang/lib/Basic/Targets/AMDGPU.cpp
@@ -238,7 +238,8 @@
   }
   if (!hasFP32Denormals)
     TargetOpts.Features.push_back(
-      (Twine(hasFastFMAF() && hasFullRateDenormalsF32() && !CGOpts.FlushDenorm
+      (Twine(hasFastFMAF() && hasFullRateDenormalsF32() &&
+             CGOpts.FP32DenormalMode == llvm::DenormalMode::IEEE
              ? '+' : '-') + Twine("fp32-denormals"))
             .str());
   // Always do not flush fp64 or fp16 denorms.
Index: clang/lib/CodeGen/CGCall.cpp
===================================================================
--- clang/lib/CodeGen/CGCall.cpp
+++ clang/lib/CodeGen/CGCall.cpp
@@ -1741,9 +1741,15 @@
 
     if (CodeGenOpts.NullPointerIsValid)
       FuncAttrs.addAttribute("null-pointer-is-valid", "true");
+
+    // TODO: Omit attribute when the default is IEEE.
     if (CodeGenOpts.FPDenormalMode != llvm::DenormalMode::Invalid)
       FuncAttrs.addAttribute("denormal-fp-math",
-                             llvm::subnormalModeName(CodeGenOpts.FPDenormalMode));
+                             llvm::denormalModeName(CodeGenOpts.FPDenormalMode));
+    if (CodeGenOpts.FP32DenormalMode != llvm::DenormalMode::Invalid)
+      FuncAttrs.addAttribute(
+          "denormal-fp-math-f32",
+          llvm::denormalModeName(CodeGenOpts.FP32DenormalMode));
 
     FuncAttrs.addAttribute("no-trapping-math",
                            llvm::toStringRef(CodeGenOpts.NoTrappingMath));
@@ -1771,10 +1777,6 @@
         "correctly-rounded-divide-sqrt-fp-math",
         llvm::toStringRef(CodeGenOpts.CorrectlyRoundedDivSqrt));
 
-    if (getLangOpts().OpenCL)
-      FuncAttrs.addAttribute("denorms-are-zero",
-                             llvm::toStringRef(CodeGenOpts.FlushDenorm));
-
     // TODO: Reciprocal estimate codegen options should apply to instructions?
     const std::vector<std::string> &Recips = CodeGenOpts.Reciprocals;
     if (!Recips.empty())
@@ -1807,10 +1809,6 @@
   if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) {
     // Exceptions aren't supported in CUDA device code.
     FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
-
-    // Respect -fcuda-flush-denormals-to-zero.
-    if (CodeGenOpts.FlushDenorm)
-      FuncAttrs.addAttribute("nvptx-f32ftz", "true");
   }
 
   for (StringRef Attr : CodeGenOpts.DefaultFunctionAttrs) {
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -558,7 +558,8 @@
     // floating point values to 0.  (This corresponds to its "__CUDA_FTZ"
     // property.)
     getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
-                              CodeGenOpts.FlushDenorm ? 1 : 0);
+                              CodeGenOpts.FP32DenormalMode !=
+                                  llvm::DenormalMode::IEEE);
   }
 
   // Emit OpenCL specific module metadata: OpenCL/SPIR version.
Index: clang/lib/Driver/ToolChains/AMDGPU.h
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPU.h
+++ clang/lib/Driver/ToolChains/AMDGPU.h
@@ -66,6 +66,11 @@
   void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                              llvm::opt::ArgStringList &CC1Args,
                              Action::OffloadKind DeviceOffloadKind) const override;
+
+  llvm::DenormalMode getDefaultDenormalModeForType(
+      const llvm::opt::ArgList &DriverArgs,
+      Action::OffloadKind DeviceOffloadKind,
+      const llvm::fltSemantics *FPType = nullptr) const override;
 };
 
 } // end namespace toolchains
Index: clang/lib/Driver/ToolChains/AMDGPU.cpp
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -12,6 +12,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "llvm/Option/ArgList.h"
+#include "llvm/Support/TargetParser.h"
 
 using namespace clang::driver;
 using namespace clang::driver::tools;
@@ -102,6 +103,40 @@
   return DAL;
 }
 
+llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType(
+    const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
+    const llvm::fltSemantics *FPType) const {
+  // Denormals should always be enabled for f16 and f64.
+  if (!FPType || FPType != &llvm::APFloat::IEEEsingle())
+    return llvm::DenormalMode::IEEE;
+
+  if (DeviceOffloadKind == Action::OFK_Cuda) {
+    if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&
+        DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
+                           options::OPT_fno_cuda_flush_denormals_to_zero,
+                           false))
+      return llvm::DenormalMode::PreserveSign;
+  }
+
+  const StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_mcpu_EQ);
+  auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch);
+
+  // Default to enabling f32 denormals by default on subtargets where fma is
+  // fast with denormals
+
+  const unsigned ArchAttr = llvm::AMDGPU::getArchAttrAMDGCN(Kind);
+  const bool DefaultDenormsAreZeroForTarget =
+    (ArchAttr & llvm::AMDGPU::FEATURE_FAST_FMA_F32) &&
+    (ArchAttr & llvm::AMDGPU::FEATURE_FAST_DENORMAL_F32);
+
+  // TODO: There are way too many flags that change this. Do we need to check
+  // them all?
+  bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) ||
+             !DefaultDenormsAreZeroForTarget;
+  // Outputs are flushed to zero, preserving sign
+  return DAZ ? llvm::DenormalMode::PreserveSign : llvm::DenormalMode::IEEE;
+}
+
 void AMDGPUToolChain::addClangTargetOptions(
     const llvm::opt::ArgList &DriverArgs,
     llvm::opt::ArgStringList &CC1Args,
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -2221,7 +2221,8 @@
 
 static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
                                        bool OFastEnabled, const ArgList &Args,
-                                       ArgStringList &CmdArgs) {
+                                       ArgStringList &CmdArgs,
+                                       Action::OffloadKind DeviceOffloadKind) {
   // Handle various floating point optimization flags, mapping them to the
   // appropriate LLVM code generation flags. This is complicated by several
   // "umbrella" flags, so we do this by stepping through the flags incrementally
@@ -2243,10 +2244,15 @@
   StringRef FPModel = "";
   // -ffp-exception-behavior options: strict, maytrap, ignore
   StringRef FPExceptionBehavior = "";
-  StringRef DenormalFPMath = "";
+  const llvm::DenormalMode DefaultDenormalFPMath =
+      TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind);
+  llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath;
   StringRef FPContract = "";
   bool StrictFPModel = false;
 
+  llvm::DenormalMode DenormalFP32Math = TC.getDefaultDenormalModeForType(
+      Args, DeviceOffloadKind, &llvm::APFloat::IEEEsingle());
+
   if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) {
     CmdArgs.push_back("-mlimit-float-precision");
     CmdArgs.push_back(A->getValue());
@@ -2368,7 +2374,19 @@
       break;
 
     case options::OPT_fdenormal_fp_math_EQ:
-      DenormalFPMath = A->getValue();
+      DenormalFPMath = llvm::parseDenormalFPAttribute(A->getValue());
+      if (DenormalFPMath == llvm::DenormalMode::Invalid) {
+        D.Diag(diag::err_drv_invalid_value)
+            << A->getAsString(Args) << A->getValue();
+      }
+      break;
+
+    case options::OPT_fdenormal_fp_math_f32_EQ:
+      DenormalFP32Math = llvm::parseDenormalFPAttribute(A->getValue());
+      if (DenormalFP32Math == llvm::DenormalMode::Invalid) {
+        D.Diag(diag::err_drv_invalid_value)
+            << A->getAsString(Args) << A->getValue();
+      }
       break;
 
     // Validate and pass through -ffp-contract option.
@@ -2438,7 +2456,7 @@
       TrappingMath = true;
       FPExceptionBehavior = "strict";
       // -fno_unsafe_math_optimizations restores default denormal handling
-      DenormalFPMath = "";
+      DenormalFPMath = DefaultDenormalFPMath;
       break;
 
     case options::OPT_Ofast:
@@ -2471,7 +2489,7 @@
       TrappingMath = false;
       RoundingFPMath = false;
       // -fno_fast_math restores default denormal and fpcontract handling
-      DenormalFPMath = "";
+      DenormalFPMath = DefaultDenormalFPMath;
       FPContract = "";
       break;
     }
@@ -2527,9 +2545,16 @@
   } else if (TrappingMathPresent)
     CmdArgs.push_back("-fno-trapping-math");
 
-  if (!DenormalFPMath.empty())
-    CmdArgs.push_back(
-        Args.MakeArgString("-fdenormal-fp-math=" + DenormalFPMath));
+  // TODO: Omit flag for the default IEEE instead
+  if (DenormalFPMath != llvm::DenormalMode::Invalid) {
+    CmdArgs.push_back(Args.MakeArgString(
+        "-fdenormal-fp-math=" + llvm::subnormalModeName(DenormalFPMath)));
+  }
+
+  if (DenormalFP32Math != llvm::DenormalMode::Invalid) {
+    CmdArgs.push_back(Args.MakeArgString(
+        "-fdenormal-fp-math-f32=" + llvm::subnormalModeName(DenormalFP32Math)));
+  }
 
   if (!FPContract.empty())
     CmdArgs.push_back(Args.MakeArgString("-ffp-contract=" + FPContract));
@@ -2744,6 +2769,8 @@
 }
 
 static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs) {
+  // cl-denorms-are-zero is not forwarded. It is translated into a generic flag
+  // for denormal flushing handling based on the target.
   const unsigned ForwardedArguments[] = {
       options::OPT_cl_opt_disable,
       options::OPT_cl_strict_aliasing,
@@ -2754,7 +2781,6 @@
       options::OPT_cl_fast_relaxed_math,
       options::OPT_cl_mad_enable,
       options::OPT_cl_no_signed_zeros,
-      options::OPT_cl_denorms_are_zero,
       options::OPT_cl_fp32_correctly_rounded_divide_sqrt,
       options::OPT_cl_uniform_work_group_size
   };
@@ -3932,7 +3958,7 @@
       CmdArgs.push_back("-mdisable-tail-calls");
 
     RenderFloatingPointOptions(TC, D, isOptimizationLevelFast(Args), Args,
-                               CmdArgs);
+                               CmdArgs, JA.getOffloadingDeviceKind());
 
     // Render ABI arguments
     switch (TC.getArch()) {
@@ -4231,7 +4257,8 @@
   if (Args.hasArg(options::OPT_fsplit_stack))
     CmdArgs.push_back("-split-stacks");
 
-  RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs);
+  RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs,
+                             JA.getOffloadingDeviceKind());
 
   if (Arg *A = Args.getLastArg(options::OPT_LongDouble_Group)) {
     if (TC.getArch() == llvm::Triple::x86 ||
Index: clang/lib/Driver/ToolChains/Cuda.h
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.h
+++ clang/lib/Driver/ToolChains/Cuda.h
@@ -149,6 +149,11 @@
                              llvm::opt::ArgStringList &CC1Args,
                              Action::OffloadKind DeviceOffloadKind) const override;
 
+  llvm::DenormalMode getDefaultDenormalModeForType(
+      const llvm::opt::ArgList &DriverArgs,
+      Action::OffloadKind DeviceOffloadKind,
+      const llvm::fltSemantics *FPType = nullptr) const override;
+
   // Never try to use the integrated assembler with CUDA; always fork out to
   // ptxas.
   bool useIntegratedAs() const override { return false; }
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/TargetParser.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include <system_error>
 
@@ -613,10 +614,6 @@
   if (DeviceOffloadingKind == Action::OFK_Cuda) {
     CC1Args.push_back("-fcuda-is-device");
 
-    if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
-                           options::OPT_fno_cuda_flush_denormals_to_zero, false))
-      CC1Args.push_back("-fcuda-flush-denormals-to-zero");
-
     if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
                            options::OPT_fno_cuda_approx_transcendentals, false))
       CC1Args.push_back("-fcuda-approx-transcendentals");
@@ -717,6 +714,21 @@
   }
 }
 
+llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
+    const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
+    const llvm::fltSemantics *FPType) const {
+  if (DeviceOffloadKind == Action::OFK_Cuda) {
+    if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&
+        DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
+                           options::OPT_fno_cuda_flush_denormals_to_zero,
+                           false))
+      return llvm::DenormalMode::PreserveSign;
+  }
+
+  assert(DeviceOffloadKind != Action::OFK_Host);
+  return llvm::DenormalMode::IEEE;
+}
+
 bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
   const Option &O = A->getOption();
   return (O.matches(options::OPT_gN_Group) &&
Index: clang/lib/Driver/ToolChains/HIP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -280,10 +280,6 @@
   CC1Args.push_back(DriverArgs.MakeArgStringRef(GpuArch));
   CC1Args.push_back("-fcuda-is-device");
 
-  if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
-                         options::OPT_fno_cuda_flush_denormals_to_zero, false))
-    CC1Args.push_back("-fcuda-flush-denormals-to-zero");
-
   if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
                          options::OPT_fno_cuda_approx_transcendentals, false))
     CC1Args.push_back("-fcuda-approx-transcendentals");
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -910,9 +910,6 @@
                         Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
                         Args.hasArg(OPT_cl_fast_relaxed_math));
   Opts.Reassociate = Args.hasArg(OPT_mreassociate);
-  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) ||
-                     (Args.hasArg(OPT_fcuda_is_device) &&
-                      Args.hasArg(OPT_fcuda_flush_denormals_to_zero));
   Opts.CorrectlyRoundedDivSqrt =
       Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
   Opts.UniformWGSize =
@@ -1271,6 +1268,13 @@
       Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val;
   }
 
+  if (Arg *A = Args.getLastArg(OPT_fdenormal_fp_math_f32_EQ)) {
+    StringRef Val = A->getValue();
+    Opts.FP32DenormalMode = llvm::parseDenormalFPAttribute(Val);
+    if (Opts.FP32DenormalMode == llvm::DenormalMode::Invalid)
+      Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val;
+  }
+
   if (Arg *A = Args.getLastArg(OPT_fpcc_struct_return, OPT_freg_struct_return)) {
     if (A->getOption().matches(OPT_fpcc_struct_return)) {
       Opts.setStructReturnConvention(CodeGenOptions::SRCK_OnStack);
Index: clang/test/CodeGenCUDA/flush-denormals.cu
===================================================================
--- clang/test/CodeGenCUDA/flush-denormals.cu
+++ clang/test/CodeGenCUDA/flush-denormals.cu
@@ -1,23 +1,34 @@
 // RUN: %clang_cc1 -fcuda-is-device \
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
-// RUN:   FileCheck %s -check-prefix CHECK -check-prefix NOFTZ
-// RUN: %clang_cc1 -fcuda-is-device -fcuda-flush-denormals-to-zero \
+// RUN:   FileCheck -check-prefix=DEFAULT %s
+
+// RUN: %clang_cc1 -fcuda-is-device -fdenormal-fp-math-f32=ieee \
+// RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
+// RUN:   FileCheck -check-prefix=NOFTZ %s
+
+// RUN: %clang_cc1 -fcuda-is-device -fdenormal-fp-math-f32=preserve-sign \
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
-// RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
+// RUN:   FileCheck -check-prefix=FTZ %s
 
+// FIXME: Unspecified should default to ieee
 // RUN: %clang_cc1 -fcuda-is-device -x hip \
 // RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
-// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ
-// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \
+// RUN:   FileCheck -check-prefix=AMDFTZ %s
+
+// RUN: %clang_cc1 -fcuda-is-device -x hip \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -fdenormal-fp-math-f32=ieee -emit-llvm -o - %s | \
+// RUN:   FileCheck -check-prefix=AMDNOFTZ %s
+
+// RUN: %clang_cc1 -fcuda-is-device -x hip -fdenormal-fp-math-f32=preserve-sign \
 // RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
-// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ
+// RUN:   FileCheck -check-prefix=AMDFTZ %s
 
 #include "Inputs/cuda.h"
 
-// Checks that device function calls get emitted with the "ntpvx-f32ftz"
-// attribute set to "true" when we compile CUDA device code with
-// -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
-// or absence of -fcuda-flush-denormals-to-zero in a module flag.
+// Checks that device function calls get emitted with the "denormal-fp-math-f32"
+// attribute set when we compile CUDA device code with
+// -fdenormal-fp-math-f32. Further, check that we reflect the presence or
+// absence of -fcuda-flush-denormals-to-zero in a module flag.
 
 // AMDGCN targets always have +fp64-fp16-denormals.
 // AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals.
@@ -28,8 +39,13 @@
 // CHECK-LABEL: define void @foo() #0
 extern "C" __device__ void foo() {}
 
-// FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
-// NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+// FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign"
+// NOFTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="ieee"
+
+
+// FIXME: This should be removed
+// DEFAULT-NOT: "denormal-fp-math-f32"
+
 // AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals
 // AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals
 
Index: clang/test/CodeGenCUDA/propagate-metadata.cu
===================================================================
--- clang/test/CodeGenCUDA/propagate-metadata.cu
+++ clang/test/CodeGenCUDA/propagate-metadata.cu
@@ -15,17 +15,17 @@
 // RUN:   %s -o %t.bc -triple nvptx-unknown-unknown
 
 // RUN: %clang_cc1 -x cuda %s -emit-llvm -mlink-builtin-bitcode %t.bc -o - \
-// RUN:   -fno-trapping-math -fcuda-is-device -triple nvptx-unknown-unknown \
+// RUN:   -fno-trapping-math -fcuda-is-device -fdenormal-fp-math-f32=ieee -triple nvptx-unknown-unknown \
 // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ --check-prefix=NOFAST
 
 // RUN: %clang_cc1 -x cuda %s -emit-llvm -mlink-builtin-bitcode %t.bc \
-// RUN:   -fno-trapping-math -fcuda-flush-denormals-to-zero -o - \
+// RUN:   -fno-trapping-math -fdenormal-fp-math-f32=preserve-sign -o - \
 // RUN:   -fcuda-is-device -triple nvptx-unknown-unknown \
 // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ \
 // RUN:   --check-prefix=NOFAST
 
 // RUN: %clang_cc1 -x cuda %s -emit-llvm -mlink-builtin-bitcode %t.bc \
-// RUN:   -fno-trapping-math -fcuda-flush-denormals-to-zero -o - \
+// RUN:   -fno-trapping-math -fdenormal-fp-math-f32=preserve-sign -o - \
 // RUN:   -fcuda-is-device -menable-unsafe-fp-math -triple nvptx-unknown-unknown \
 // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
 
@@ -51,12 +51,18 @@
 // CHECK: define void @kernel() [[attr:#[0-9]+]]
 // CHECK: define internal void @lib_fn() [[attr]]
 
+// FIXME: These -NOT checks do not work as intended and do not check on the same
+// line.
+
 // Check the attribute list.
 // CHECK: attributes [[attr]] = {
-// CHECK: "no-trapping-math"="true"
 
-// FTZ-SAME: "nvptx-f32ftz"="true"
-// NOFTZ-NOT: "nvptx-f32ftz"="true"
+// FTZ-NOT: "denormal-fp-math"
+
+// FTZ-SAME: "denormal-fp-math-f32"="preserve-sign"
+// NOFTZ-SAME: "denormal-fp-math-f32"="ieee"
+
+// CHECK-SAME: "no-trapping-math"="true"
 
 // FAST-SAME: "unsafe-fp-math"="true"
 // NOFAST-NOT: "unsafe-fp-math"="true"
Index: clang/test/CodeGenOpenCL/amdgpu-features.cl
===================================================================
--- clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -14,13 +14,13 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX600 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX601 %s
 
-// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX801: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+s-memrealtime"
+// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX801: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+s-memrealtime,-fp32-denormals"
 // GFX700: "target-features"="+ci-insts,+flat-address-space,+fp64-fp16-denormals,-fp32-denormals"
 // GFX600: "target-features"="+fp64-fp16-denormals,-fp32-denormals"
 // GFX601: "target-features"="+fp64-fp16-denormals,-fp32-denormals"
Index: clang/test/CodeGenOpenCL/denorms-are-zero.cl
===================================================================
--- clang/test/CodeGenOpenCL/denorms-are-zero.cl
+++ /dev/null
@@ -1,45 +0,0 @@
-// RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - %s | FileCheck -check-prefix=DENORM-ZERO %s
-
-// Slow FMAF and slow f32 denormals
-// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu pitcairn %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu pitcairn %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s
-
-// Fast FMAF, but slow f32 denormals
-// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu tahiti %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu tahiti %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s
-
-// Fast F32 denormals, but slow FMAF
-// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s
-
-// Fast F32 denormals and fast FMAF
-// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu gfx900 %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-DENORM %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu gfx900 %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s
-
-// RUN: %clang_cc1 -emit-llvm -target-feature +fp32-denormals -target-feature -fp64-fp16-denormals -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FEATURE %s
-// RUN: %clang_cc1 -emit-llvm -target-feature +fp32-denormals -target-feature -fp64-fp16-denormals -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu pitcairn %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FEATURE %s
-
-
-
-// For all targets 'denorms-are-zero' attribute is set to 'true'
-// if '-cl-denorms-are-zero' was specified and  to 'false' otherwise.
-
-// CHECK-LABEL: define {{(dso_local )?}}void @f()
-// CHECK: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="false"
-//
-// DENORM-ZERO-LABEL: define {{(dso_local )?}}void @f()
-// DENORM-ZERO: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="true"
-
-// For amdgcn target cpu fiji, fp32 should be flushed since fiji does not support fp32 denormals, unless +fp32-denormals is
-// explicitly set. amdgcn target always do not flush fp64 denormals. The control for fp64 and fp16 denormals is the same.
-
-// AMDGCN-LABEL: define void @f()
-
-// AMDGCN-FLUSH: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="false" {{.*}} "target-features"="{{[^"]*}}+fp64-fp16-denormals,{{[^"]*}}-fp32-denormals{{[^"]*}}"
-// AMDGCN-FLUSH-OPT: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="true" {{.*}} "target-features"="{{[^"]*}}+fp64-fp16-denormals,{{[^"]*}}-fp32-denormals{{[^"]*}}"
-
-// AMDGCN-DENORM: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="false" {{.*}} "target-features"="{{[^"]*}}+fp32-denormals,{{[^"]*}}+fp64-fp16-denormals{{[^"]*}}"
-
-// AMDGCN-FEATURE: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="true" {{.*}} "target-features"="{{[^"]*}}+fp32-denormals,{{[^"]*}}-fp64-fp16-denormals{{[^"]*}}"
-void f() {}
Index: clang/test/CodeGenOpenCL/gfx9-fp32-denorms.cl
===================================================================
--- clang/test/CodeGenOpenCL/gfx9-fp32-denorms.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-// REQUIRES: amdgpu-registered-target
-
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - %s | FileCheck --check-prefix=DEFAULT %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - -target-feature +fp32-denormals %s | FileCheck --check-prefix=FEATURE_FP32_DENORMALS_ON %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - -target-feature -fp32-denormals %s | FileCheck --check-prefix=FEATURE_FP32_DENORMALS_OFF %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - -cl-denorms-are-zero %s | FileCheck --check-prefix=OPT_DENORMS_ARE_ZERO %s
-
-// DEFAULT: +fp32-denormals
-// FEATURE_FP32_DENORMALS_ON: +fp32-denormals
-// FEATURE_FP32_DENORMALS_OFF: -fp32-denormals
-// OPT_DENORMS_ARE_ZERO: -fp32-denormals
-
-kernel void gfx9_fp32_denorms() {}
Index: clang/test/Driver/cl-denorms-are-zero.cl
===================================================================
--- /dev/null
+++ clang/test/Driver/cl-denorms-are-zero.cl
@@ -0,0 +1,20 @@
+// Slow FMAF and slow f32 denormals
+// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=pitcairn %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=pitcairn %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+
+// Fast FMAF, but slow f32 denormals
+// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+
+// Fast F32 denormals, but slow FMAF
+// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=fiji %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=fiji %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+
+// Fast F32 denormals and fast FMAF
+// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-DENORM %s
+// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+
+// AMDGCN-FLUSH: "-fdenormal-fp-math-f32=preserve-sign"
+
+// This should be omitted and default to ieee
+// AMDGCN-DENORM-NOT: "-fdenormal-fp-math-f32"
Index: clang/test/Driver/cuda-flush-denormals-to-zero.cu
===================================================================
--- /dev/null
+++ clang/test/Driver/cuda-flush-denormals-to-zero.cu
@@ -0,0 +1,13 @@
+// Checks that cuda compilation does the right thing when passed
+// -fcuda-flush-denormals-to-zero. This should be translated to
+// -fdenormal-fp-math-f32=preserve-sign
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_20 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_20 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_10 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_10 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
+
+// CPUFTZ-NOT: -fdenormal-fp-math
+
+// FTZ: "-fdenormal-fp-math-f32=preserve-sign"
+// NOFTZ: "-fdenormal-fp-math=ieee"
Index: clang/test/Driver/denormal-fp-math.c
===================================================================
--- clang/test/Driver/denormal-fp-math.c
+++ clang/test/Driver/denormal-fp-math.c
@@ -5,7 +5,7 @@
 // RUN: %clang -### -target arm-unknown-linux-gnu -c %s -fdenormal-fp-math=ieee -fno-unsafe-math-optimizations -v 2>&1 | FileCheck -check-prefix=CHECK-NO-UNSAFE %s
 // RUN: not %clang -target arm-unknown-linux-gnu -c %s -fdenormal-fp-math=foo -v 2>&1 | FileCheck -check-prefix=CHECK-INVALID %s
 
-// CHECK-IEEE: "-fdenormal-fp-math=ieee"
+// CHECK-IEEE: -fdenormal-fp-math=ieee
 // CHECK-PS: "-fdenormal-fp-math=preserve-sign"
 // CHECK-PZ: "-fdenormal-fp-math=positive-zero"
 // CHECK-NO-UNSAFE-NOT: "-fdenormal-fp-math=ieee"
Index: clang/test/Driver/opencl.cl
===================================================================
--- clang/test/Driver/opencl.cl
+++ clang/test/Driver/opencl.cl
@@ -32,7 +32,10 @@
 // CHECK-FAST-RELAXED-MATH: "-cc1" {{.*}} "-cl-fast-relaxed-math"
 // CHECK-MAD-ENABLE: "-cc1" {{.*}} "-cl-mad-enable"
 // CHECK-NO-SIGNED-ZEROS: "-cc1" {{.*}} "-cl-no-signed-zeros"
-// CHECK-DENORMS-ARE-ZERO: "-cc1" {{.*}} "-cl-denorms-are-zero"
+
+// This is not forwarded
+// CHECK-DENORMS-ARE-ZERO-NOT: "-cl-denorms-are-zero"
+
 // CHECK-ROUND-DIV: "-cc1" {{.*}} "-cl-fp32-correctly-rounded-divide-sqrt"
 // CHECK-UNIFORM-WG: "-cc1" {{.*}} "-cl-uniform-work-group-size"
 // CHECK-C99: error: invalid value 'c99' in '-cl-std=c99'
Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -1818,6 +1818,22 @@
     mode or that might alter the state of floating-point status flags that
     might otherwise be set or cleared by calling this function. LLVM will
     not introduce any new floating-point instructions that may trap.
+
+``"denormal-fp-math"``
+  This indicates the subnormal handling that may be assumed for the
+   default floating-point environment. This may be one of ``"ieee"``,
+   ``"preserve-sign"``, or ``"positive-zero"``.  If this is attribute
+   is not specified, the default is ``"ieee"``. If the mode is
+   ``"preserve-sign"``, or ``"positive-zero"``, subnormal outputs may
+   be flushed to zero by standard floating point operations. It is not
+   mandated that flushing to zero occurs, but if a subnormal output is
+   flushed to zero, it must respect the sign mode. Not all targets
+   support all modes.
+
+``"denormal-fp-math-f32"``
+   Same as ``"denorm-fp-math-f32"``, except for float types. If both
+   are present, this overrides ``"denorm-fp-math"``.
+
 ``"thunk"``
     This attribute indicates that the function will delegate to some other
     function with a tail call. The prototype of a thunk should not be used for
Index: llvm/lib/CodeGen/MachineFunction.cpp
===================================================================
--- llvm/lib/CodeGen/MachineFunction.cpp
+++ llvm/lib/CodeGen/MachineFunction.cpp
@@ -271,6 +271,16 @@
 }
 
 DenormalMode MachineFunction::getDenormalMode(const fltSemantics &FPType) const {
+  if (&FPType == &APFloat::IEEEsingle()) {
+    Attribute Attr = F.getFnAttribute("denormal-fp-math-f32");
+    StringRef Val = Attr.getValueAsString();
+    if (!Val.empty())
+      return parseDenormalFPAttribute(Val);
+
+    // If the f32 variant of the attribute isn't specified, try to use the
+    // generic one.
+  }
+
   // TODO: Should probably avoid the connection to the IR and store directly
   // in the MachineFunction.
   Attribute Attr = F.getFnAttribute("denormal-fp-math");
Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -120,14 +120,10 @@
   if (FtzEnabled.getNumOccurrences() > 0) {
     // If nvptx-f32ftz is used on the command-line, always honor it
     return FtzEnabled;
-  } else {
-    const Function &F = MF.getFunction();
-    // Otherwise, check for an nvptx-f32ftz attribute on the function
-    if (F.hasFnAttribute("nvptx-f32ftz"))
-      return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
-    else
-      return false;
   }
+
+  return MF.getDenormalMode(APFloat::IEEEsingle()) ==
+         DenormalMode::PreserveSign;
 }
 
 static bool IsPTXVectorType(MVT VT) {
Index: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
@@ -1703,9 +1704,10 @@
   // intrinsic, we don't have to look up any module metadata, as
   // FtzRequirementTy will be FTZ_Any.)
   if (Action.FtzRequirement != FTZ_Any) {
-    bool FtzEnabled =
-        II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() ==
-        "true";
+    StringRef Attr = II->getFunction()
+                         ->getFnAttribute("denormal-fp-math-f32")
+                         .getValueAsString();
+    bool FtzEnabled = parseDenormalFPAttribute(Attr) != DenormalMode::IEEE;
 
     if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
       return nullptr;
Index: llvm/test/CodeGen/NVPTX/fast-math.ll
===================================================================
--- llvm/test/CodeGen/NVPTX/fast-math.ll
+++ llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -162,4 +162,4 @@
 }
 
 attributes #0 = { "unsafe-fp-math" = "true" }
-attributes #1 = { "nvptx-f32ftz" = "true" }
+attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
Index: llvm/test/CodeGen/NVPTX/math-intrins.ll
===================================================================
--- llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -289,4 +289,4 @@
 }
 
 attributes #0 = { nounwind readnone }
-attributes #1 = { "nvptx-f32ftz" = "true" }
+attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
Index: llvm/test/CodeGen/NVPTX/sqrt-approx.ll
===================================================================
--- llvm/test/CodeGen/NVPTX/sqrt-approx.ll
+++ llvm/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -146,5 +146,5 @@
 }
 
 attributes #0 = { "unsafe-fp-math" = "true" }
-attributes #1 = { "nvptx-f32ftz" = "true" }
+attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
 attributes #2 = { "reciprocal-estimates" = "rsqrtf:1,rsqrtd:1,sqrtf:1,sqrtd:1" }
Index: llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
===================================================================
--- llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
+++ llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
@@ -5,11 +5,11 @@
 ; hackery:
 
 ; RUN: cat %s > %t.ftz
-; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "true" }' >> %t.ftz
+; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "preserve-sign" }' >> %t.ftz
 ; RUN: opt < %t.ftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ
 
 ; RUN: cat %s > %t.noftz
-; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "false" }' >> %t.noftz
+; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "ieee" }' >> %t.noftz
 ; RUN: opt < %t.noftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ
 
 ; We handle nvvm intrinsics with ftz variants as follows: