[AMDGPU] fcaninicalize optimization for GFX9+

rampitec · rampitec · commit dc2890a88722 · 2017-07-13T23:59:15.000Z
Since GFX9 supports denorm modes for v_min_f32/v_max_f32 that is possible to further optimize fcanonicalize and remove it if applied to min/max given their operands are known not to be an sNaN or that sNaNs are not supported. Additionally we can remove fcanonicalize if denorms are supported for the VT and we know that its argument is never a NaN. Differential Revision: https://reviews.llvm.org/D35335 llvm-svn: 307976
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -359,6 +359,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
     return FP64FP16Denormals;
   }
 
+  bool supportsMinMaxDenormModes() const {
+    return getGeneration() >= AMDGPUSubtarget::GFX9;
+  }
+
   bool hasFPExceptions() const {
     return FPExceptions;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4624,8 +4624,8 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
   return DAG.isKnownNeverNaN(Op);
 }
 
-static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
-                            unsigned MaxDepth=5) {
+static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
+                            const SISubtarget *ST, unsigned MaxDepth=5) {
   // If source is a result of another standard FP operation it is already in
   // canonical form.
 
@@ -4663,7 +4663,7 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
   case ISD::FNEG:
   case ISD::FABS:
     return (MaxDepth > 0) &&
-           isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1);
+           isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
 
   case ISD::FSIN:
   case ISD::FCOS:
@@ -4672,16 +4672,19 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
 
   // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
   // For such targets need to check their input recursively.
-  // TODO: on GFX9+ we could return true without checking provided no-nan
-  // mode, since canonicalization is also used to quiet sNaNs.
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
   case ISD::FMINNAN:
   case ISD::FMAXNAN:
 
+    if (ST->supportsMinMaxDenormModes() &&
+        DAG.isKnownNeverNaN(Op.getOperand(0)) &&
+        DAG.isKnownNeverNaN(Op.getOperand(1)))
+      return true;
+
     return (MaxDepth > 0) &&
-           isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
-           isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
+           isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
+           isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
 
   case ISD::ConstantFP: {
     auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
@@ -4700,11 +4703,19 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
 
   if (!CFP) {
     SDValue N0 = N->getOperand(0);
+    EVT VT = N0.getValueType().getScalarType();
+    auto ST = getSubtarget();
+
+    if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
+         (VT == MVT::f64 && ST->hasFP64Denormals()) ||
+         (VT == MVT::f16 && ST->hasFP16Denormals())) &&
+        DAG.isKnownNeverNaN(N0))
+      return N0;
 
     bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
 
     if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
-        isCanonicalized(N0, getSubtarget()))
+        isCanonicalized(DAG, N0, ST))
       return N0;
 
     return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -347,7 +347,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace
 }
 
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V]]
 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -388,9 +390,11 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace
 }
 
 ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GCN:  v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
-; GCN:  v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
-; GCN:  flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9:  v_min_f32_e32 [[V:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; VI:    v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; VI:    v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
+; GCN:   flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -402,9 +406,11 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa
 }
 
 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
-; GCN:  v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
-; GCN:  v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
+; GFX9:  v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; VI:    v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
+; VI:    v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
 ; GCN:  flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -465,6 +471,49 @@ entry:
   ret float %canonicalized
 }
 
+; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32
+; GFX9-DENORM: flat_load_dword [[V:v[0-9]+]],
+; GFX9-DENORM: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9-DENORM-NOT: 1.0
+; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %v = load float, float addrspace(1)* %gep, align 4
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+  store float %canonicalized, float addrspace(1)* %gep2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
+; GCN: flat_load_dwordx2 [[V:v\[[0-9:]+\]]],
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+  %v = load double, double addrspace(1)* %gep, align 8
+  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
+  store double %canonicalized, double addrspace(1)* %gep2, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
+; GCN: flat_load_ushort [[V:v[0-9]+]],
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+  %v = load half, half addrspace(1)* %gep, align 2
+  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
+  store half %canonicalized, half addrspace(1)* %gep2, align 2
+  ret void
+}
+
 declare float @llvm.canonicalize.f32(float) #0
 declare double @llvm.canonicalize.f64(double) #0
 declare half @llvm.canonicalize.f16(half) #0
@@ -485,3 +534,4 @@ declare float @llvm.maxnum.f32(float, float) #0
 declare double @llvm.maxnum.f64(double, double) #0
 
 attributes #0 = { nounwind readnone }
+attributes #1 = { "no-nans-fp-math"="true" }

Original file line number	Diff line number	Diff line change
`@@ -359,6 +359,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {`
`359`	`359`	`return FP64FP16Denormals;`
`360`	`360`	`}`
`361`	`361`
	`362`	`+ bool supportsMinMaxDenormModes() const {`
	`363`	`+ return getGeneration() >= AMDGPUSubtarget::GFX9;`
	`364`	`+ }`
	`365`	`+`
`362`	`366`	`bool hasFPExceptions() const {`
`363`	`367`	`return FPExceptions;`
`364`	`368`	`}`