Skip to content

Commit 9d7b1c9

Browse files
committed Jul 6, 2017
[AMDGPU] Always use rcp + mul with fast math
Regardless of relaxation options such as -cl-fast-relaxed-math, we are producing rather long code for fdiv via the amdgcn_fdiv_fast intrinsic. This intrinsic is used to replace an fdiv carrying 2.5ulp fpmath metadata and does not handle denormals, and is thus believed to be fast. An fdiv instruction can also have the fast math flag, either by itself or together with fpmath metadata. Clang used with a relaxation flag always produces both the metadata and the fast flag:

  %div = fdiv fast float %v, %0, !fpmath !12
  !12 = !{float 2.500000e+00}

The current implementation ignores the fast flag and favors the metadata. An instruction with just the fast flag would be lowered to the fastest rcp + mul, but that never happens in practice because of the combined clang and backend (BE) behavior described above. This change allows an "fdiv fast" to always be lowered as rcp + mul.

Differential Revision: https://reviews.llvm.org/D34844
llvm-svn: 307308
1 parent e9b5857 commit 9d7b1c9

File tree

4 files changed

+40
-51
lines changed

4 files changed

+40
-51
lines changed
 

‎llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,9 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
380380
FastMathFlags FMF = FPOp->getFastMathFlags();
381381
bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
382382
FMF.allowReciprocal();
383-
if (ST->hasFP32Denormals() && !UnsafeDiv)
383+
384+
// With UnsafeDiv node will be optimized to just rcp and mul.
385+
if (ST->hasFP32Denormals() || UnsafeDiv)
384386
return false;
385387

386388
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);

‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+5-7
Original file line numberDiff line numberDiff line change
@@ -3736,7 +3736,9 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
37363736
SDValue LHS = Op.getOperand(0);
37373737
SDValue RHS = Op.getOperand(1);
37383738
EVT VT = Op.getValueType();
3739-
bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
3739+
const SDNodeFlags Flags = Op->getFlags();
3740+
bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
3741+
Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
37403742

37413743
if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
37423744
return SDValue();
@@ -3771,15 +3773,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
37713773
}
37723774
}
37733775

3774-
const SDNodeFlags Flags = Op->getFlags();
3775-
3776-
if (Unsafe || Flags.hasAllowReciprocal()) {
3776+
if (Unsafe) {
37773777
// Turn into multiply by the reciprocal.
37783778
// x / y -> x * (1.0 / y)
3779-
SDNodeFlags NewFlags;
3780-
NewFlags.setUnsafeAlgebra(true);
37813779
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
3782-
return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, NewFlags);
3780+
return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
37833781
}
37843782

37853783
return SDValue();

‎llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll

+10-28
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a,
1616
; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
1717
; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
1818
; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
19-
; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
20-
; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
19+
; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
20+
; CHECK: arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
2121
define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
2222
%no.md = fdiv float %a, %b
2323
store volatile float %no.md, float addrspace(1)* %out
@@ -110,15 +110,8 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
110110
; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
111111
; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
112112
; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
113-
114-
; CHECK: extractelement <2 x float> %x
115-
; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
116-
; CHECK: extractelement <2 x float> %x
117-
; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
118-
; CHECK: store volatile <2 x float> %arcp.25ulp
119-
120-
; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
121-
; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
113+
; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
114+
; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
122115
; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
123116
define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
124117
%no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
@@ -146,17 +139,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
146139
; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
147140
; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
148141
; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
149-
150-
; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
151-
; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
152-
; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
153-
; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
154-
; CHECK: store volatile <2 x float> %arcp.25ulp
155-
156-
; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
157-
; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
158-
; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
159-
; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
142+
; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
143+
; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
160144
; CHECK: store volatile <2 x float> %fast.25ulp
161145
define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
162146
%no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
@@ -179,12 +163,10 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace
179163

180164
; FIXME: Should be able to get fdiv for 1.0 component
181165
; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
182-
; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
183-
; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
166+
; CHECK: %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
184167
; CHECK: store volatile <2 x float> %arcp.25ulp
185168

186-
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
187-
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
169+
; CHECK: %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
188170
; CHECK: store volatile <2 x float> %fast.25ulp
189171
define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
190172
%x.insert = insertelement <2 x float> %x, float 1.0, i32 0
@@ -204,8 +186,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a
204186
; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
205187
; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
206188
; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
207-
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
208-
; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
189+
; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
190+
; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
209191
define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
210192
%no.md = fdiv float %a, %b
211193
store volatile float %no.md, float addrspace(1)* %out

‎llvm/test/CodeGen/AMDGPU/fdiv.ll

+22-15
Original file line numberDiff line numberDiff line change
@@ -85,20 +85,11 @@ entry:
8585
}
8686

8787
; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
88-
; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
89-
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
90-
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
91-
92-
; GCN-NOT: s_setreg
93-
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
94-
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
95-
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
96-
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
97-
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
98-
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
88+
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
89+
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
90+
; GCN-NOT: [[RESULT]]
9991
; GCN-NOT: s_setreg
100-
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
101-
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
92+
; GCN: buffer_store_dword [[RESULT]]
10293
define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
10394
entry:
10495
%fdiv = fdiv fast float %a, %b
@@ -121,6 +112,21 @@ entry:
121112
ret void
122113
}
123114

115+
; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
116+
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
117+
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
118+
119+
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
120+
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
121+
; GCN-NOT: [[RESULT]]
122+
; GCN: buffer_store_dword [[RESULT]]
123+
define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
124+
entry:
125+
%fdiv = fdiv fast float %a, %b, !fpmath !0
126+
store float %fdiv, float addrspace(1)* %out
127+
ret void
128+
}
129+
124130
; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
125131
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
126132
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
@@ -154,8 +160,9 @@ entry:
154160
}
155161

156162
; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
157-
; GCN: v_cmp_gt_f32
158-
; GCN: v_cmp_gt_f32
163+
; GCN: v_rcp_f32
164+
; GCN: v_rcp_f32
165+
; GCN-NOT: v_cmp_gt_f32
159166
define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
160167
entry:
161168
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0

0 commit comments

Comments
 (0)
Please sign in to comment.