
Commit fb9a530

Committed Apr 28, 2019
[DAGCombiner] try repeated fdiv divisor transform before building estimate
This was originally part of D61028, but it's an independent diff.

If we try the repeated divisor reciprocal transform before producing an estimate sequence, then we have an opportunity to use scalar fdiv. On x86, the trade-off is 1 divss vs. 5 vector FP ops in the default estimate sequence. On recent chips (Skylake, Ryzen), the full-precision division has only 3-cycle throughput, so that's probably the better default for performance, and it avoids the problems caused by x86's inaccurate estimates.

The last 2 tests show that users still have the option to override the defaults by using the function attributes for reciprocal estimates, but those patterns are potentially made faster by converting the vector ops (including ymm ops) to scalar math.

Differential Revision: https://reviews.llvm.org/D61149

llvm-svn: 359398
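The pattern in question is a vector division in which every lane is divided by the same scalar. A minimal LLVM IR sketch of that shape, modeled on the splat_fdiv_v4f32 test updated below (the exact body and fast-math flags in the test file may differ):

  define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
    ; build a splat of the scalar divisor %y
    %vy = insertelement <4 x float> undef, float %y, i32 0
    %vyspl = shufflevector <4 x float> %vy, <4 x float> undef, <4 x i32> zeroinitializer
    ; divide every lane of %x by %y; 'arcp' allows a reciprocal-based lowering
    %r = fdiv arcp <4 x float> %x, %vyspl
    ret <4 x float> %r
  }

With this change, the default x86 lowering becomes one divss (1.0 / %y), a splat, and a mulps, rather than an rcpps-based estimate plus a Newton-Raphson refinement step.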
1 parent: 43003f0

2 files changed: +30 −42

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+3 −3
@@ -11992,6 +11992,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;

+  if (SDValue V = combineRepeatedFPDivisors(N))
+    return V;
+
   if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
     // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
     if (N1CFP) {
@@ -12081,9 +12084,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
     }
   }

-  if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
-    return CombineRepeatedDivisors;
-
   return SDValue();
 }

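The reason for hoisting the call: combineRepeatedFPDivisors rewrites divisions that share a divisor into a single reciprocal plus multiplies, and for a splatted vector divisor that reciprocal can be formed with one scalar division. If the estimate path runs first, the fdiv has already been expanded and the cheaper form is lost. A rough IR-level sketch of the effect (the real combine operates on SelectionDAG nodes, and the minimum number of repeated divisors is a target-tuned threshold, so treat this as the shape rather than the details):

  ; before: repeated divisions by one value, with reciprocal allowed ('arcp')
    %q1 = fdiv arcp float %a, %y
    %q2 = fdiv arcp float %b, %y
  ; after: a single division forms the reciprocal; the uses become multiplies
    %r  = fdiv arcp float 1.0, %y
    %p1 = fmul arcp float %a, %r
    %p2 = fmul arcp float %b, %r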

llvm/test/CodeGen/X86/fdiv-combine-vec.ll

+27 −39
@@ -51,25 +51,17 @@ define <4 x double> @splat_fdiv_v4f64(<4 x double> %x, double %y) {
 define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
 ; SSE-LABEL: splat_fdiv_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    rcpps %xmm1, %xmm2
-; SSE-NEXT:    mulps %xmm2, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    addps %xmm2, %xmm3
-; SSE-NEXT:    mulps %xmm3, %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    divss %xmm1, %xmm2
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT:    mulps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splat_fdiv_v4f32:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT:    vrcpps %xmm1, %xmm2
-; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vaddps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %vy = insertelement <4 x float> undef, float %y, i32 0
@@ -90,14 +82,10 @@ define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
 ;
 ; AVX-LABEL: splat_fdiv_v8f32:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT:    vrcpps %ymm1, %ymm2
-; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT:    vaddps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %vy = insertelement <8 x float> undef, float %y, i32 0
@@ -109,25 +97,25 @@ define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
 define <4 x float> @splat_fdiv_v4f32_estimate(<4 x float> %x, float %y) #0 {
 ; SSE-LABEL: splat_fdiv_v4f32_estimate:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    rcpps %xmm1, %xmm2
-; SSE-NEXT:    mulps %xmm2, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    addps %xmm2, %xmm3
+; SSE-NEXT:    rcpss %xmm1, %xmm2
+; SSE-NEXT:    mulss %xmm2, %xmm1
+; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT:    subss %xmm1, %xmm3
+; SSE-NEXT:    mulss %xmm2, %xmm3
+; SSE-NEXT:    addss %xmm2, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-NEXT:    mulps %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splat_fdiv_v4f32_estimate:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT:    vrcpps %xmm1, %xmm2
-; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vaddps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %vy = insertelement <4 x float> undef, float %y, i32 0
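For reference, the scalar sequence that now appears in the estimate tests is the usual single Newton-Raphson refinement of the hardware reciprocal estimate: starting from r0 = rcpss(y), it computes r1 = r0 + r0 * (1 - y * r0), and only then splats r1 and multiplies by %x. The refinement math is unchanged by this patch; it has simply moved from packed (ps) to scalar (ss) instructions before the broadcast.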
@@ -152,14 +140,14 @@ define <8 x float> @splat_fdiv_v8f32_estimate(<8 x float> %x, float %y) #0 {
 ;
 ; AVX-LABEL: splat_fdiv_v8f32_estimate:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT:    vrcpps %ymm1, %ymm2
-; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT:    vaddps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %vy = insertelement <8 x float> undef, float %y, i32 0
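The *_estimate tests reference an attribute group #0, which is how they opt back into the reciprocal-estimate lowering that is no longer the default for this pattern. The test file most likely defines it with the reciprocal-estimates function attribute, roughly as follows (the exact string is not visible in this diff, so treat it as an assumption):

  attributes #0 = { "reciprocal-estimates"="vec-divf,divf" }

Even with the estimate forced on, the new output performs the rcpss refinement in scalar registers and broadcasts the result, replacing the former full-width xmm/ymm sequence.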
