Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -310,6 +310,16 @@
 
   bool isSDNodeSourceOfDivergence(const SDNode *N,
     FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
+
+  /// Return true if \p Op is known to already be in canonical form, so that
+  /// an fcanonicalize of it can be folded away. \p MaxDepth is the remaining
+  /// budget for recursing into the operands of \p Op.
+  bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
+                       unsigned MaxDepth = 5) const;
+
+  /// Return true if the subtarget has denormals enabled for the scalar type
+  /// of \p VT (f16, f32 or f64); any other type yields false.
+  bool denormalsEnabledForType(EVT VT) const;
 };
 
 } // End namespace llvm
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6791,78 +6791,102 @@
   case ISD::FMINNAN:
     // TODO: What do these do for snans?
   default:
-    return false;
+    return DAG.getTarget().Options.NoNaNsFPMath;
   }
 }
 
-static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
-                            const GCNSubtarget *ST, unsigned MaxDepth=5) {
+bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
+                                       unsigned MaxDepth) const {
   // If source is a result of another standard FP operation it is already in
   // canonical form.
 
   switch (Op.getOpcode()) {
-  default:
-    break;
-
   // These will flush denorms if required.
   case ISD::FADD:
   case ISD::FSUB:
   case ISD::FMUL:
-  case ISD::FSQRT:
   case ISD::FCEIL:
   case ISD::FFLOOR:
   case ISD::FMA:
   case ISD::FMAD:
-
   case ISD::FCANONICALIZE:
+  case AMDGPUISD::FMUL_LEGACY:
     return true;
-
+  case ISD::FSQRT:
+  case ISD::FDIV:
+  case ISD::FREM:
+    return !hasFloatingPointExceptions();
   case ISD::FP_ROUND:
     return Op.getValueType().getScalarType() != MVT::f16 ||
-           ST->hasFP16Denormals();
+           Subtarget->hasFP16Denormals();
 
   case ISD::FP_EXTEND:
     return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
-           ST->hasFP16Denormals();
+           Subtarget->hasFP16Denormals();
 
   case ISD::FP16_TO_FP:
   case ISD::FP_TO_FP16:
-    return ST->hasFP16Denormals();
+    return Subtarget->hasFP16Denormals();
 
   // It can/will be lowered or combined as a bit operation.
   // Need to check their input recursively to handle.
   case ISD::FNEG:
   case ISD::FABS:
+  case ISD::FCOPYSIGN:
     return (MaxDepth > 0) &&
-           isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
+           isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
 
   case ISD::FSIN:
   case ISD::FCOS:
   case ISD::FSINCOS:
     return Op.getValueType().getScalarType() != MVT::f16;
 
-  // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
-  // For such targets need to check their input recursively.
   case ISD::FMINNUM:
-  case ISD::FMAXNUM:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
-
-    if (ST->supportsMinMaxDenormModes() &&
-        DAG.isKnownNeverNaN(Op.getOperand(0)) &&
-        DAG.isKnownNeverNaN(Op.getOperand(1)))
+  case ISD::FMAXNUM: {
+    // Returns quieted sNaNs
+    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+    if (IsIEEEMode && Subtarget->supportsMinMaxDenormModes()) {
+      // IEEE mode quiets sNaNs and these targets honor the denorm mode in
+      // min/max. In pre-GFX9 targets V_MIN_F32 and others do not flush
+      // denorms; for such targets the inputs need to be checked below.
+      // FIXME: Shouldn't treat the generic operations different based on this.
       return true;
+    }
 
-    return (MaxDepth > 0) &&
-           isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
-           isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
+    // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
+    // needs to be quieted.
+    if (denormalsEnabledForType(Op.getValueType())) {
+      // No flushing required, so we just need to care about snans.
+      return isKnownNeverSNan(DAG, Op.getOperand(0)) &&
+             isKnownNeverSNan(DAG, Op.getOperand(1));
+    }
 
+    // Flushing or quieting may be necessary.
+    return (MaxDepth > 0) &&
+           isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
+           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+  }
   case ISD::ConstantFP: {
     auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
-    return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
+    if (F.isNaN() && F.isSignaling())
+      return false;
+    return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
   }
+  case ISD::SELECT: {
+    // Either value operand may be the result, so both must be canonical.
+    // MaxDepth is unsigned; check it before decrementing to avoid wrapping.
+    return (MaxDepth > 0) &&
+           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
+           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
   }
-  return false;
+  default:
+    // Unknown producer: canonical only if denormals need no flushing and the
+    // value is known never to be a NaN.
+    return denormalsEnabledForType(Op.getValueType()) &&
+           DAG.isKnownNeverNaN(Op);
+  }
+
+  llvm_unreachable("invalid operation");
 }
 
 // Constant fold canonicalize.
@@ -6874,22 +6898,7 @@
 
   if (!CFP) {
     SDValue N0 = N->getOperand(0);
-    EVT VT = N0.getValueType().getScalarType();
-    auto ST = getSubtarget();
-
-    if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
-         (VT == MVT::f64 && ST->hasFP64Denormals()) ||
-         (VT == MVT::f16 && ST->hasFP16Denormals())) &&
-        DAG.isKnownNeverNaN(N0))
-      return N0;
-
-    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
-
-    if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
-        isCanonicalized(DAG, N0, ST))
-      return N0;
-
-    return SDValue();
+    return isCanonicalized(DAG, N0) ? N0 : SDValue();
   }
 
   const APFloat &C = CFP->getValueAPF();
@@ -8506,3 +8515,16 @@
   }
   return false;
 }
+
+bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
+  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
+  case MVT::f32:
+    return Subtarget->hasFP32Denormals();
+  case MVT::f64:
+    return Subtarget->hasFP64Denormals();
+  case MVT::f16:
+    return Subtarget->hasFP16Denormals();
+  default:
+    return false;
+  }
+}
Index: test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
===================================================================
--- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -1,7 +1,8 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-FLUSH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,GCN-FLUSH %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,VI-FLUSH,GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
 
 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
 ; GCN-FLUSH:   v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
@@ -29,10 +30,26 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_legacy_value_f32:
+; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
+; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = call float @llvm.amdgcn.fmul.legacy(float %load, float 15.0)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
 ; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -45,8 +62,9 @@
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
 ; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -59,8 +77,10 @@
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
 ; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN-NOEXCEPT-NOT: v_mul
+; GCN-NOEXCEPT-NOT: v_max
+; GCN-EXCEPT: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V]]
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -73,8 +93,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
 ; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -87,8 +108,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
 ; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -101,8 +123,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
 ; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -116,8 +139,9 @@
 ; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
 ; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
+; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}],
 define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -132,8 +156,9 @@
 ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
 ; GCN-FLUSH:  v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
 ; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -146,8 +171,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
 ; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -161,8 +187,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
 ; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
@@ -176,8 +203,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
 ; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
@@ -211,8 +239,9 @@
 ; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
 ; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
 ; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
@@ -239,8 +268,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
 ; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -265,10 +295,28 @@
   ret void
 }
 
+; GCN-LABEL: test_no_fold_canonicalize_fcopysign_value_f32:
+; GCN-FLUSH:  v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
+; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
+; GCN-NOT: v_mul_
+; GCN-NOT: v_max_
+define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(float addrspace(1)* %arg, float %sign) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %canon.load = tail call float @llvm.canonicalize.f32(float %load)
+  %copysign = call float @llvm.copysign.f32(float %canon.load, float %sign) ; NOTE(review): %copysign (and %canon.load feeding it) has no users; the canonicalize below consumes %v, the fabs of %load -- confirm this is the intended input
+  %v = tail call float @llvm.fabs.f32(float %load)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
 ; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
 ; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -282,8 +330,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
 ; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -296,8 +345,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
 ; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -311,8 +361,9 @@
 ; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
 ; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
@@ -326,8 +377,9 @@
 ; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
 ; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
@@ -340,8 +392,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
 ; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -351,8 +404,13 @@
 }
 
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
-; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GFX9-NOT: v_max
+; GFX9-NOT: v_mul
+
+; VI-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+
 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -364,10 +422,28 @@
   ret void
 }
 
+; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan:
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN-DENORM-NOT: v_max
+; GCN-DENORM-NOT: v_mul
+; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+
+; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan(float addrspace(1)* %arg) #1 {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
 ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -382,9 +458,9 @@
 ; FIXME: Should there be more checks here? minnum with NaN operand is simplified away.
 
 ; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
-; VI:   v_add_u32_e32 v{{[0-9]+}}
-; GFX9:	v_add_co_u32_e32 v{{[0-9]+}}
-; GCN:  {{flat|global}}_store_dword v[{{[0-9:]+}}]
+; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]]
+; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]]
+; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]]
 define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -397,10 +473,17 @@
 
 ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
 ; GFX9:  v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
-; VI:    v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
-; VI:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+
+; VI-FLUSH: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+
+; VI-DENORM: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], [[V0]], [[V0]]
+
+
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN:   {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
-; GFX9-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -413,10 +496,15 @@
 
 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
 ; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
-; VI:    v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
-; VI:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+; VI-FLUSH:    v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
+; VI-FLUSH:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+
+; VI-DENORM: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
+; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], [[V0]], [[V0]]
+
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN:  {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
-; GFX9-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -429,8 +517,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
 ; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN-NOT: v_max
+; GCN-NOT: v_mul
 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -444,8 +533,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
 ; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
@@ -459,6 +549,7 @@
 
 ; GCN-LABEL: test_no_fold_canonicalize_fdiv_value_f32_no_ieee:
 ; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GCN-NOEXCEPT-NOT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 define amdgpu_ps float @test_no_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) {
 entry:
   %v = fdiv float %arg0, 15.0
@@ -468,8 +559,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
 ; GCN-NEXT: ; return
-; GCN-NOT: 1.0
 define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
 entry:
   %v = fmul nnan float %arg, 15.0
@@ -495,7 +587,8 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
 ; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]],
 ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
+; GCN-NOT: v_mul_
+; GCN-NOT: v_max_
 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
@@ -508,8 +601,9 @@
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
 ; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
-; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
+; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V]]
 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
@@ -520,11 +614,76 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_fold_canonicalize_select_value_f32:
+; GCN: v_add_f32
+; GCN: v_add_f32
+; GCN: v_cndmask_b32
+; GCN-NOT: v_mul_
+; GCN-NOT: v_max_
+define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load0 = load volatile float, float addrspace(1)* %gep, align 4
+  %load1 = load volatile float, float addrspace(1)* %gep, align 4
+  %load2 = load volatile i32, i32 addrspace(1)* undef, align 4
+  %v0 = fadd float %load0, 15.0
+  %v1 = fadd float %load1, 32.0
+  %cond = icmp eq i32 %load2, 0
+  %select = select i1 %cond, float %v0, float %v1
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %select)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; Need to quiet the nan with a separate instruction since it will be
+; passed through the minnum.
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
+; GFX9: v_min_f32_e32 v0, v0, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX9-DENORM-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: ; return to shader
+
+; VI: v_min_f32_e32 v0, v0, v1
+; VI-FLUSH: v_mul_f32_e32 v0, 1.0, v0
+; VI-DENORM: v_max_f32_e32 v0, v0, v0
+define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
+  %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  ret float %canonicalized
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_ieee_mode:
+; GFX9: v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64
+
+; VI: v_min_f32_e32 v0, v0, v1
+; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-DENORM-NEXT: v_max_f32_e32 v0, v0, v0
+; VI-NEXT: s_setpc_b64
+define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
+  %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  ret float %canonicalized
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
+; GCN: v_min_f32_e32 v0, v0, v1
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(float %arg0, float %arg1) #1 {
+  %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  ret float %canonicalized
+}
+
 ; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0
 ; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive
 ; CHECK: .amd_amdgpu_isa
 
 declare float @llvm.canonicalize.f32(float) #0
+declare float @llvm.copysign.f32(float, float) #0
+declare float @llvm.amdgcn.fmul.legacy(float, float) #0
 declare double @llvm.canonicalize.f64(double) #0
 declare half @llvm.canonicalize.f16(half) #0
 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0