Skip to content

Commit cb2abc7

Browse files
committed May 6, 2018
[X86] Enable reciprocal estimates for v16f32 vectors by using VRCP14PS/VRSQRT14PS
Summary: The legacy VRCPPS/VRSQRTPS instructions aren't available in 512-bit versions. The new increased precision versions are. So we can use those to implement v16f32 reciprocal estimates. For KNL CPUs we can probably use VRCP28PS/VRSQRT28PS and avoid the NR step altogether, but I leave that for a future patch. Reviewers: spatel Reviewed By: spatel Subscribers: RKSimon, llvm-commits, mehdi_amini Differential Revision: https://reviews.llvm.org/D46498 llvm-svn: 331606
1 parent b02e3de commit cb2abc7

File tree

4 files changed

+71
-37
lines changed

4 files changed

+71
-37
lines changed
 

‎llvm/lib/Target/X86/X86ISelLowering.cpp

+10-6
Original file line numberDiff line numberDiff line change
@@ -17803,7 +17803,6 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
1780317803
EVT VT = Op.getValueType();
1780417804

1780517805
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
17806-
// TODO: Add support for AVX512 (v16f32).
1780717806
// It is likely not profitable to do this for f64 because a double-precision
1780817807
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
1780917808
// instructions: convert to single, rsqrtss, convert back to double, refine
@@ -17814,12 +17813,15 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
1781417813
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
1781517814
(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
1781617815
(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
17817-
(VT == MVT::v8f32 && Subtarget.hasAVX())) {
17816+
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
17817+
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
1781817818
if (RefinementSteps == ReciprocalEstimate::Unspecified)
1781917819
RefinementSteps = 1;
1782017820

1782117821
UseOneConstNR = false;
17822-
return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
17822+
// There is no FRSQRT for 512-bits, but there is RSQRT14.
17823+
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
17824+
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
1782317825
}
1782417826
return SDValue();
1782517827
}
@@ -17832,7 +17834,6 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
1783217834
EVT VT = Op.getValueType();
1783317835

1783417836
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
17835-
// TODO: Add support for AVX512 (v16f32).
1783617837
// It is likely not profitable to do this for f64 because a double-precision
1783717838
// reciprocal estimate with refinement on x86 prior to FMA requires
1783817839
// 15 instructions: convert to single, rcpss, convert back to double, refine
@@ -17841,7 +17842,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
1784117842

1784217843
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
1784317844
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
17844-
(VT == MVT::v8f32 && Subtarget.hasAVX())) {
17845+
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
17846+
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
1784517847
// Enable estimate codegen with 1 refinement step for vector division.
1784617848
// Scalar division estimates are disabled because they break too much
1784717849
// real-world code. These defaults are intended to match GCC behavior.
@@ -17851,7 +17853,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
1785117853
if (RefinementSteps == ReciprocalEstimate::Unspecified)
1785217854
RefinementSteps = 1;
1785317855

17854-
return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
17856+
// There is no FRCP for 512-bits, but there is RCP14.
17857+
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
17858+
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
1785517859
}
1785617860
return SDValue();
1785717861
}

‎llvm/test/CodeGen/X86/recip-fastmath.ll

+20-8
Original file line numberDiff line numberDiff line change
@@ -1024,14 +1024,16 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
10241024
;
10251025
; KNL-LABEL: v16f32_one_step:
10261026
; KNL: # %bb.0:
1027-
; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1028-
; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1027+
; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
1028+
; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1029+
; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
10291030
; KNL-NEXT: retq # sched: [7:1.00]
10301031
;
10311032
; SKX-LABEL: v16f32_one_step:
10321033
; SKX: # %bb.0:
1033-
; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1034-
; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1034+
; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
1035+
; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1036+
; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
10351037
; SKX-NEXT: retq # sched: [7:1.00]
10361038
%div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
10371039
ret <16 x float> %div
@@ -1222,14 +1224,24 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
12221224
;
12231225
; KNL-LABEL: v16f32_two_step:
12241226
; KNL: # %bb.0:
1225-
; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1226-
; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1227+
; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
1228+
; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1229+
; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00]
1230+
; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
1231+
; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
1232+
; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
1233+
; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
12271234
; KNL-NEXT: retq # sched: [7:1.00]
12281235
;
12291236
; SKX-LABEL: v16f32_two_step:
12301237
; SKX: # %bb.0:
1231-
; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1232-
; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1238+
; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
1239+
; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1240+
; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33]
1241+
; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.33]
1242+
; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.33]
1243+
; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.33]
1244+
; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.33]
12331245
; SKX-NEXT: retq # sched: [7:1.00]
12341246
%div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
12351247
ret <16 x float> %div

‎llvm/test/CodeGen/X86/recip-fastmath2.ll

+36-20
Original file line numberDiff line numberDiff line change
@@ -1323,14 +1323,18 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
13231323
;
13241324
; KNL-LABEL: v16f32_one_step2:
13251325
; KNL: # %bb.0:
1326-
; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
1327-
; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1326+
; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
1327+
; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1328+
; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
1329+
; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
13281330
; KNL-NEXT: retq # sched: [7:1.00]
13291331
;
13301332
; SKX-LABEL: v16f32_one_step2:
13311333
; SKX: # %bb.0:
1332-
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
1333-
; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1334+
; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
1335+
; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1336+
; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
1337+
; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
13341338
; SKX-NEXT: retq # sched: [7:1.00]
13351339
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
13361340
ret <16 x float> %div
@@ -1485,16 +1489,18 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
14851489
;
14861490
; KNL-LABEL: v16f32_one_step_2_divs:
14871491
; KNL: # %bb.0:
1488-
; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1489-
; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1492+
; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
1493+
; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1494+
; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
14901495
; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50]
14911496
; KNL-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50]
14921497
; KNL-NEXT: retq # sched: [7:1.00]
14931498
;
14941499
; SKX-LABEL: v16f32_one_step_2_divs:
14951500
; SKX: # %bb.0:
1496-
; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1497-
; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1501+
; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
1502+
; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1503+
; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
14981504
; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50]
14991505
; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
15001506
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1703,14 +1709,26 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
17031709
;
17041710
; KNL-LABEL: v16f32_two_step2:
17051711
; KNL: # %bb.0:
1706-
; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
1707-
; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1712+
; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
1713+
; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1714+
; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00]
1715+
; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
1716+
; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
1717+
; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
1718+
; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
1719+
; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
17081720
; KNL-NEXT: retq # sched: [7:1.00]
17091721
;
17101722
; SKX-LABEL: v16f32_two_step2:
17111723
; SKX: # %bb.0:
1712-
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
1713-
; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1724+
; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
1725+
; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1726+
; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33]
1727+
; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.33]
1728+
; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.33]
1729+
; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.33]
1730+
; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.33]
1731+
; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
17141732
; SKX-NEXT: retq # sched: [7:1.00]
17151733
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
17161734
ret <16 x float> %div
@@ -1763,14 +1781,12 @@ define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
17631781
;
17641782
; KNL-LABEL: v16f32_no_step:
17651783
; KNL: # %bb.0:
1766-
; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1767-
; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1784+
; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [5:1.00]
17681785
; KNL-NEXT: retq # sched: [7:1.00]
17691786
;
17701787
; SKX-LABEL: v16f32_no_step:
17711788
; SKX: # %bb.0:
1772-
; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1773-
; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1789+
; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [9:2.00]
17741790
; SKX-NEXT: retq # sched: [7:1.00]
17751791
%div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
17761792
ret <16 x float> %div
@@ -1839,14 +1855,14 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
18391855
;
18401856
; KNL-LABEL: v16f32_no_step2:
18411857
; KNL: # %bb.0:
1842-
; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
1843-
; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1858+
; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [5:1.00]
1859+
; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
18441860
; KNL-NEXT: retq # sched: [7:1.00]
18451861
;
18461862
; SKX-LABEL: v16f32_no_step2:
18471863
; SKX: # %bb.0:
1848-
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
1849-
; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1864+
; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [9:2.00]
1865+
; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
18501866
; SKX-NEXT: retq # sched: [7:1.00]
18511867
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
18521868
ret <16 x float> %div

‎llvm/test/CodeGen/X86/sqrt-fastmath.ll

+5-3
Original file line numberDiff line numberDiff line change
@@ -515,9 +515,11 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
515515
;
516516
; AVX512-LABEL: v16f32_estimate:
517517
; AVX512: # %bb.0:
518-
; AVX512-NEXT: vsqrtps %zmm0, %zmm0
519-
; AVX512-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
520-
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
518+
; AVX512-NEXT: vrsqrt14ps %zmm0, %zmm1
519+
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
520+
; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
521+
; AVX512-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm1, %zmm1
522+
; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
521523
; AVX512-NEXT: retq
522524
%sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
523525
%div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt

0 commit comments

Comments
 (0)
Please sign in to comment.