Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -17803,7 +17803,6 @@
   EVT VT = Op.getValueType();
 
   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
-  // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   // instructions: convert to single, rsqrtss, convert back to double, refine
@@ -17814,12 +17813,15 @@
   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
       (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
       (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
-      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+      (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
     if (RefinementSteps == ReciprocalEstimate::Unspecified)
       RefinementSteps = 1;
 
     UseOneConstNR = false;
-    return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+    // There is no 512-bit FRSQRT, but there is RSQRT14.
+    unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
+    return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
   }
   return SDValue();
 }
@@ -17832,7 +17834,6 @@
   EVT VT = Op.getValueType();
 
   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
-  // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // reciprocal estimate with refinement on x86 prior to FMA requires
   // 15 instructions: convert to single, rcpss, convert back to double, refine
@@ -17841,7 +17842,8 @@
 
   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
-      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+      (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
     // Enable estimate codegen with 1 refinement step for vector division.
     // Scalar division estimates are disabled because they break too much
     // real-world code. These defaults are intended to match GCC behavior.
@@ -17851,7 +17853,9 @@
     if (RefinementSteps == ReciprocalEstimate::Unspecified)
       RefinementSteps = 1;
 
-    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+    // There is no 512-bit FRCP, but there is RCP14.
+    unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
+    return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
   }
   return SDValue();
 }
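
Since RefinementSteps defaults to 1 on this path, the RSQRT14/RCP14 nodes never reach the tests bare: the generic expander wraps each estimate in one Newton-Raphson step, which is what produces the FMA pairs in the CHECK lines below. A minimal scalar sketch of that math, in illustrative C++ only (the estimate values passed in are hypothetical stand-ins, not outputs of the real instructions):

```c++
#include <cmath>
#include <cstdio>

// One Newton-Raphson step for y ~= 1/x: y1 = y0 * (2 - x*y0). Codegen
// splits this into e = 1 - x*y0 (an FNMADD against a 1.0 splat) and
// y1 = y0 + y0*e (an FMADD) -- the vfnmadd213ps/vfmadd132ps pairs below.
float refineRecip(float x, float y0) {
  float e = 1.0f - x * y0;
  return y0 + y0 * e;
}

// One Newton-Raphson step for y ~= 1/sqrt(x): y1 = y0 * (1.5 - 0.5*x*y0*y0).
float refineRsqrt(float x, float y0) {
  return y0 * (1.5f - 0.5f * x * y0 * y0);
}

int main() {
  float x = 3.0f;
  // Stand-ins at roughly RCP14/RSQRT14's ~2^-14 relative accuracy.
  std::printf("rcp:   %.8f vs %.8f\n", refineRecip(x, 0.33319f), 1.0f / x);
  std::printf("rsqrt: %.8f vs %.8f\n", refineRsqrt(x, 0.57700f),
              1.0f / std::sqrt(x));
  return 0;
}
```

One step from a ~14-bit estimate lands close to full float precision, which is why the defaults stay at a single step.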
Index: test/CodeGen/X86/recip-fastmath.ll
===================================================================
--- test/CodeGen/X86/recip-fastmath.ll
+++ test/CodeGen/X86/recip-fastmath.ll
@@ -1024,14 +1024,16 @@
 ;
 ; KNL-LABEL: v16f32_one_step:
 ; KNL: # %bb.0:
-; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
 ; KNL-NEXT: retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v16f32_one_step:
 ; SKX: # %bb.0:
-; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %div = fdiv fast <16 x float> , %x
 ret <16 x float> %div
@@ -1222,14 +1224,24 @@
 ;
 ; KNL-LABEL: v16f32_two_step:
 ; KNL: # %bb.0:
-; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
 ; KNL-NEXT: retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v16f32_two_step:
 ; SKX: # %bb.0:
-; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.33]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %div = fdiv fast <16 x float> , %x
 ret <16 x float> %div
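
Why a single refinement step is the right default deserves a line of justification: Newton-Raphson converges quadratically, so each step squares the relative error. A worked bound, assuming RCP14 delivers the 2^-14 relative accuracy its name suggests (my arithmetic, not part of the patch):

```latex
% One reciprocal NR step: y_1 = y_0 (2 - a y_0).
% Write the estimate as y_0 = (1 - \varepsilon)/a, so a y_0 = 1 - \varepsilon. Then
\[
  y_1 \;=\; \frac{(1-\varepsilon)\bigl(2-(1-\varepsilon)\bigr)}{a}
      \;=\; \frac{(1-\varepsilon)(1+\varepsilon)}{a}
      \;=\; \frac{1-\varepsilon^{2}}{a}.
\]
% With |\varepsilon| \le 2^{-14}, one step leaves at most 2^{-28} relative
% error, already below float's 2^{-24} machine epsilon; v16f32_two_step
% mainly exercises the refinement-count plumbing rather than buying accuracy.
```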
Index: test/CodeGen/X86/recip-fastmath2.ll
===================================================================
--- test/CodeGen/X86/recip-fastmath2.ll
+++ test/CodeGen/X86/recip-fastmath2.ll
@@ -1323,14 +1323,18 @@
 ;
 ; KNL-LABEL: v16f32_one_step2:
 ; KNL: # %bb.0:
-; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
 ; KNL-NEXT: retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v16f32_one_step2:
 ; SKX: # %bb.0:
-; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %div = fdiv fast <16 x float> , %x
 ret <16 x float> %div
@@ -1485,16 +1489,18 @@
 ;
 ; KNL-LABEL: v16f32_one_step_2_divs:
 ; KNL: # %bb.0:
-; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
 ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50]
 ; KNL-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50]
 ; KNL-NEXT: retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v16f32_one_step_2_divs:
 ; SKX: # %bb.0:
-; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
 ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50]
 ; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
 ; SKX-NEXT: retq # sched: [7:1.00]
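
The sched: [latency:throughput] annotations carry the profitability argument for SKX; spelled out (my arithmetic, using only numbers appearing in this diff):

```latex
% SKX reciprocal throughputs from the sched: [lat:rtp] annotations above:
%   vdivps zmm = 10.00, vrcp14ps = 2.00, vfnmadd213ps = 0.50, vfmadd132ps = 0.33
\[
  \frac{\mathrm{rtp}_{\text{old}}}{\mathrm{rtp}_{\text{new}}}
  \;=\; \frac{10.00}{\max(2.00,\;0.50,\;0.33)} \;=\; 5\times
\]
% Latency moves the other way: the dependent chain is 9 + 11 + 4 = 24 cycles
% against vdivps's 18, so the transform trades latency for throughput and
% keeps the divider free for surrounding code.
```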
@@ -1703,14 +1709,26 @@
 ;
 ; KNL-LABEL: v16f32_two_step2:
 ; KNL: # %bb.0:
-; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
 ; KNL-NEXT: retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v16f32_two_step2:
 ; SKX: # %bb.0:
-; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %div = fdiv fast <16 x float> , %x
 ret <16 x float> %div
@@ -1763,14 +1781,12 @@
 ;
 ; KNL-LABEL: v16f32_no_step:
 ; KNL: # %bb.0:
-; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [5:1.00]
 ; KNL-NEXT: retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v16f32_no_step:
 ; SKX: # %bb.0:
-; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [9:2.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %div = fdiv fast <16 x float> , %x
 ret <16 x float> %div
@@ -1839,14 +1855,14 @@
 ;
 ; KNL-LABEL: v16f32_no_step2:
 ; KNL: # %bb.0:
-; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [5:1.00]
+; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
 ; KNL-NEXT: retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v16f32_no_step2:
 ; SKX: # %bb.0:
-; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [9:2.00]
+; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %div = fdiv fast <16 x float> , %x
 ret <16 x float> %div
Index: test/CodeGen/X86/sqrt-fastmath.ll
===================================================================
--- test/CodeGen/X86/sqrt-fastmath.ll
+++ test/CodeGen/X86/sqrt-fastmath.ll
@@ -515,9 +515,11 @@
 ;
 ; AVX512-LABEL: v16f32_estimate:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vsqrtps %zmm0, %zmm0
-; AVX512-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vrsqrt14ps %zmm0, %zmm1
+; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
+; AVX512-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT: retq
 %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
 %div = fdiv fast <16 x float> , %sqrt
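
Decoding the new v16f32_estimate sequence back into math may save reviewers a trip to the DAGCombiner. A scalar model in illustrative C++; the two broadcast constants are not visible in the CHECK lines, so the -3.0f and -0.5f below are assumptions based on the standard two-constant Newton-Raphson form for rsqrt:

```c++
#include <cmath>
#include <cstdio>

// Scalar model of the five-instruction AVX512 sequence above (sketch only).
// e stands in for the vrsqrt14ps result; the refined value works out to
// e * (3 - x*e*e) / 2, computed with the assumed -3.0/-0.5 splats.
float modelRsqrtEstimate(float x, float e) {
  float t = x * e;        // vmulps %zmm1, %zmm0, %zmm0
  t = e * t + -3.0f;      // vfmadd213ps ... + mem   -> x*e*e - 3
  float h = e * -0.5f;    // vmulps ...{1to16}       -> -e/2
  return h * t;           // vmulps                  -> e*(3 - x*e*e)/2
}

int main() {
  float x = 2.0f;
  float e = 0.7071f;  // hypothetical ~14-bit rsqrt estimate of 1/sqrt(2)
  std::printf("%.8f vs %.8f\n", modelRsqrtEstimate(x, e),
              1.0f / std::sqrt(x));
  return 0;
}
```

This is the two-constant variant selected by UseOneConstNR = false in the lowering change above, which keeps -3.0 and -0.5 as separate splat operands rather than folding them into a single constant.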