@@ -1323,14 +1323,18 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
1323
1323
;
1324
1324
; KNL-LABEL: v16f32_one_step2:
1325
1325
; KNL: # %bb.0:
1326
- ; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
1327
- ; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1326
+ ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
1327
+ ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1328
+ ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
1329
+ ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
1328
1330
; KNL-NEXT: retq # sched: [7:1.00]
1329
1331
;
1330
1332
; SKX-LABEL: v16f32_one_step2:
1331
1333
; SKX: # %bb.0:
1332
- ; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
1333
- ; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1334
+ ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
1335
+ ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1336
+ ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
1337
+ ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
1334
1338
; SKX-NEXT: retq # sched: [7:1.00]
1335
1339
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1336
1340
ret <16 x float> %div
@@ -1485,16 +1489,18 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
1485
1489
;
1486
1490
; KNL-LABEL: v16f32_one_step_2_divs:
1487
1491
; KNL: # %bb.0:
1488
- ; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1489
- ; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1492
+ ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
1493
+ ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1494
+ ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
1490
1495
; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50]
1491
1496
; KNL-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50]
1492
1497
; KNL-NEXT: retq # sched: [7:1.00]
1493
1498
;
1494
1499
; SKX-LABEL: v16f32_one_step_2_divs:
1495
1500
; SKX: # %bb.0:
1496
- ; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1497
- ; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1501
+ ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
1502
+ ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1503
+ ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
1498
1504
; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50]
1499
1505
; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
1500
1506
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1703,14 +1709,26 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
1703
1709
;
1704
1710
; KNL-LABEL: v16f32_two_step2:
1705
1711
; KNL: # %bb.0:
1706
- ; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
1707
- ; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1712
+ ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
1713
+ ; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1714
+ ; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00]
1715
+ ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
1716
+ ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
1717
+ ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
1718
+ ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
1719
+ ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
1708
1720
; KNL-NEXT: retq # sched: [7:1.00]
1709
1721
;
1710
1722
; SKX-LABEL: v16f32_two_step2:
1711
1723
; SKX: # %bb.0:
1712
- ; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
1713
- ; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1724
+ ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
1725
+ ; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1726
+ ; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33]
1727
+ ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.33]
1728
+ ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.33]
1729
+ ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.33]
1730
+ ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.33]
1731
+ ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
1714
1732
; SKX-NEXT: retq # sched: [7:1.00]
1715
1733
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1716
1734
ret <16 x float> %div
@@ -1763,14 +1781,12 @@ define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
1763
1781
;
1764
1782
; KNL-LABEL: v16f32_no_step:
1765
1783
; KNL: # %bb.0:
1766
- ; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1767
- ; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1784
+ ; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [5:1.00]
1768
1785
; KNL-NEXT: retq # sched: [7:1.00]
1769
1786
;
1770
1787
; SKX-LABEL: v16f32_no_step:
1771
1788
; SKX: # %bb.0:
1772
- ; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1773
- ; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1789
+ ; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [9:2.00]
1774
1790
; SKX-NEXT: retq # sched: [7:1.00]
1775
1791
%div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1776
1792
ret <16 x float> %div
@@ -1839,14 +1855,14 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
1839
1855
;
1840
1856
; KNL-LABEL: v16f32_no_step2:
1841
1857
; KNL: # %bb.0:
1842
- ; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
1843
- ; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
1858
+ ; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [5:1.00]
1859
+ ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
1844
1860
; KNL-NEXT: retq # sched: [7:1.00]
1845
1861
;
1846
1862
; SKX-LABEL: v16f32_no_step2:
1847
1863
; SKX: # %bb.0:
1848
- ; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
1849
- ; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
1864
+ ; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [9:2.00]
1865
+ ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
1850
1866
; SKX-NEXT: retq # sched: [7:1.00]
1851
1867
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1852
1868
ret <16 x float> %div
0 commit comments