Skip to content

Commit dc2890a

Browse files
committed Jul 13, 2017
[AMDGPU] fcanonicalize optimization for GFX9+
Since GFX9 supports denorm modes for v_min_f32/v_max_f32, it is possible to further optimize fcanonicalize and remove it when it is applied to min/max, provided their operands are known not to be sNaNs or sNaNs are not supported. Additionally, we can remove fcanonicalize if denorms are supported for the VT and we know that its argument is never a NaN. Differential Revision: https://reviews.llvm.org/D35335 llvm-svn: 307976
1 parent 890eedc commit dc2890a

File tree

3 files changed

+79
-14
lines changed

3 files changed

+79
-14
lines changed
 

‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

+4
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
359359
return FP64FP16Denormals;
360360
}
361361

362+
bool supportsMinMaxDenormModes() const {
363+
return getGeneration() >= AMDGPUSubtarget::GFX9;
364+
}
365+
362366
bool hasFPExceptions() const {
363367
return FPExceptions;
364368
}

‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+19-8
Original file line numberDiff line numberDiff line change
@@ -4624,8 +4624,8 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
46244624
return DAG.isKnownNeverNaN(Op);
46254625
}
46264626

4627-
static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
4628-
unsigned MaxDepth=5) {
4627+
static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
4628+
const SISubtarget *ST, unsigned MaxDepth=5) {
46294629
// If source is a result of another standard FP operation it is already in
46304630
// canonical form.
46314631

@@ -4663,7 +4663,7 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
46634663
case ISD::FNEG:
46644664
case ISD::FABS:
46654665
return (MaxDepth > 0) &&
4666-
isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1);
4666+
isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
46674667

46684668
case ISD::FSIN:
46694669
case ISD::FCOS:
@@ -4672,16 +4672,19 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
46724672

46734673
// In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
46744674
// For such targets need to check their input recursively.
4675-
// TODO: on GFX9+ we could return true without checking provided no-nan
4676-
// mode, since canonicalization is also used to quiet sNaNs.
46774675
case ISD::FMINNUM:
46784676
case ISD::FMAXNUM:
46794677
case ISD::FMINNAN:
46804678
case ISD::FMAXNAN:
46814679

4680+
if (ST->supportsMinMaxDenormModes() &&
4681+
DAG.isKnownNeverNaN(Op.getOperand(0)) &&
4682+
DAG.isKnownNeverNaN(Op.getOperand(1)))
4683+
return true;
4684+
46824685
return (MaxDepth > 0) &&
4683-
isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
4684-
isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
4686+
isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
4687+
isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
46854688

46864689
case ISD::ConstantFP: {
46874690
auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
@@ -4700,11 +4703,19 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
47004703

47014704
if (!CFP) {
47024705
SDValue N0 = N->getOperand(0);
4706+
EVT VT = N0.getValueType().getScalarType();
4707+
auto ST = getSubtarget();
4708+
4709+
if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
4710+
(VT == MVT::f64 && ST->hasFP64Denormals()) ||
4711+
(VT == MVT::f16 && ST->hasFP16Denormals())) &&
4712+
DAG.isKnownNeverNaN(N0))
4713+
return N0;
47034714

47044715
bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
47054716

47064717
if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
4707-
isCanonicalized(N0, getSubtarget()))
4718+
isCanonicalized(DAG, N0, ST))
47084719
return N0;
47094720

47104721
return SDValue();

‎llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll

+56-6
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace
347347
}
348348

349349
; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
350-
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
350+
; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
351+
; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
352+
; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V]]
351353
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
352354
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
353355
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -388,9 +390,11 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace
388390
}
389391

390392
; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
391-
; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
392-
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
393-
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
393+
; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
394+
; VI: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
395+
; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
396+
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
397+
; GFX9-NOT: 1.0
394398
define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
395399
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
396400
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -402,9 +406,11 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa
402406
}
403407

404408
; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
405-
; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
406-
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
409+
; GFX9: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
410+
; VI: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
411+
; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
407412
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
413+
; GFX9-NOT: 1.0
408414
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
409415
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
410416
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -465,6 +471,49 @@ entry:
465471
ret float %canonicalized
466472
}
467473

474+
; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32
475+
; GFX9-DENORM: flat_load_dword [[V:v[0-9]+]],
476+
; GFX9-DENORM: flat_store_dword v[{{[0-9:]+}}], [[V]]
477+
; GFX9-DENORM-NOT: 1.0
478+
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
479+
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 {
480+
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
481+
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
482+
%v = load float, float addrspace(1)* %gep, align 4
483+
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
484+
%gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
485+
store float %canonicalized, float addrspace(1)* %gep2, align 4
486+
ret void
487+
}
488+
489+
; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
490+
; GCN: flat_load_dwordx2 [[V:v\[[0-9:]+\]]],
491+
; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
492+
; GCN-NOT: 1.0
493+
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
494+
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
495+
%gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
496+
%v = load double, double addrspace(1)* %gep, align 8
497+
%canonicalized = tail call double @llvm.canonicalize.f64(double %v)
498+
%gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
499+
store double %canonicalized, double addrspace(1)* %gep2, align 8
500+
ret void
501+
}
502+
503+
; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
504+
; GCN: flat_load_ushort [[V:v[0-9]+]],
505+
; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
506+
; GCN-NOT: 1.0
507+
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
508+
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
509+
%gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
510+
%v = load half, half addrspace(1)* %gep, align 2
511+
%canonicalized = tail call half @llvm.canonicalize.f16(half %v)
512+
%gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
513+
store half %canonicalized, half addrspace(1)* %gep2, align 2
514+
ret void
515+
}
516+
468517
declare float @llvm.canonicalize.f32(float) #0
469518
declare double @llvm.canonicalize.f64(double) #0
470519
declare half @llvm.canonicalize.f16(half) #0
@@ -485,3 +534,4 @@ declare float @llvm.maxnum.f32(float, float) #0
485534
declare double @llvm.maxnum.f64(double, double) #0
486535

487536
attributes #0 = { nounwind readnone }
537+
attributes #1 = { "no-nans-fp-math"="true" }

0 commit comments

Comments
 (0)