diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -530,7 +530,7 @@ SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); SDValue BuildLogBase2(SDValue V, const SDLoc &DL); - SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags); + SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags); SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip); @@ -12684,10 +12684,8 @@ } // Fold into a reciprocal estimate and multiply instead of a real divide. - if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) { - AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); - } + if (SDValue RV = BuildDivEstimate(N0, N1, Flags)) + return RV; } // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) @@ -20347,7 +20345,10 @@ /// => /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form /// does not require additional intermediate precision] -SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) { +/// For the last iteration, put numerator N into it to gain more precision: +/// Result = N X_i + X_i (N - N A X_i) +SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op, + SDNodeFlags Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -20368,18 +20369,39 @@ if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) { AddToWorklist(Est.getNode()); + SDLoc DL(Op); if (Iterations) { - SDLoc DL(Op); SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); - // Newton iterations: Est = Est + Est (1 - Arg * Est) + // Newton iterations: Est = Est + Est (N - Arg * Est) + // If this is the last iteration, also multiply by the numerator. for (int i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags); - NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags); + SDValue MulEst = Est; + + if (i == Iterations - 1) { + MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags); + AddToWorklist(MulEst.getNode()); + } + + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, DL, VT, + (i == Iterations - 1 ? N : FPOne), NewEst, Flags); + AddToWorklist(NewEst.getNode()); + NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); - Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags); + AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags); + AddToWorklist(Est.getNode()); } + } else { + // If no iterations are available, multiply with N. + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags); + AddToWorklist(Est.getNode()); } + return Est; } diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -114,7 +114,7 @@ ; FUNC-LABEL: {{^}}fdiv_f32_fast_math: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z, ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] @@ -129,7 +129,7 @@ ; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z, ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] @@ -144,7 +144,7 @@ ; FUNC-LABEL: {{^}}fdiv_f32_arcp_math: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z, ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] @@ -188,8 +188,8 @@ ; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X, +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W, ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 @@ -203,8 +203,8 @@ ; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X, +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W, ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 @@ -243,10 +243,10 @@ ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}}, +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}}, +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}}, +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}}, ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 @@ -266,10 +266,10 @@ ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}}, +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}}, +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}}, +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}}, ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.ll b/llvm/test/CodeGen/AMDGPU/rsq.ll --- a/llvm/test/CodeGen/AMDGPU/rsq.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.ll @@ -48,7 +48,7 @@ ; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]] ; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]] -; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RSQA]], [[RCPB]] +; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]] ; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI-UNSAFE: buffer_store_dword [[RESULT]] diff --git a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll --- a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll +++ b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll @@ -229,11 +229,11 @@ ; CHECK-NEXT: qvfre 3, 2 ; CHECK-NEXT: addi 3, 3, .LCPI8_0@toc@l ; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfnmsub 4, 2, 3, 0 -; CHECK-NEXT: qvfmadd 3, 3, 4, 3 ; CHECK-NEXT: qvfnmsub 0, 2, 3, 0 ; CHECK-NEXT: qvfmadd 0, 3, 0, 3 -; CHECK-NEXT: qvfmul 1, 1, 0 +; CHECK-NEXT: qvfmul 3, 1, 0 +; CHECK-NEXT: qvfnmsub 1, 2, 3, 1 +; CHECK-NEXT: qvfmadd 1, 0, 1, 3 ; CHECK-NEXT: blr entry: %r = fdiv fast <4 x double> %a, %b @@ -266,13 +266,10 @@ define <4 x float> @goo2_fmf(<4 x float> %a, <4 x float> %b) nounwind { ; CHECK-LABEL: goo2_fmf: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI10_0@toc@ha -; CHECK-NEXT: qvfres 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI10_0@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: qvfnmsubs 0, 2, 3, 0 -; CHECK-NEXT: qvfmadds 0, 3, 0, 3 -; CHECK-NEXT: qvfmuls 1, 1, 0 +; CHECK-NEXT: qvfres 0, 2 +; CHECK-NEXT: qvfmuls 3, 1, 0 +; CHECK-NEXT: qvfnmsubs 1, 2, 3, 1 +; CHECK-NEXT: qvfmadds 1, 0, 1, 3 ; CHECK-NEXT: blr entry: %r = fdiv fast <4 x float> %a, %b diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -145,13 +145,13 @@ define float @rsqrt_fmul_fmf(float %a, float %b, float %c) { ; CHECK: @rsqrt_fmul_fmf ; CHECK-DAG: frsqrtes -; CHECK-DAG: fres -; CHECK-DAG: fnmsubs -; CHECK-DAG: fmuls -; CHECK-DAG: fmadds -; CHECK-DAG: fmadds ; CHECK: fmuls +; CHECK-NEXT: fmadds ; CHECK-NEXT: fmuls +; CHECK-DAG: fres +; CHECK-COUNT-3: fmuls +; CHECK-NEXT: fmsubs +; CHECK-NEXT: fmadds ; CHECK-NEXT: fmuls ; CHECK-NEXT: blr %x = call fast float @llvm.sqrt.f32(float %a) @@ -196,9 +196,9 @@ ; CHECK-DAG: fre ; CHECK-DAG: fnmsub ; CHECK: fmadd +; CHECK-NEXT: fmul ; CHECK-NEXT: fnmsub ; CHECK-NEXT: fmadd -; CHECK-NEXT: fmul ; CHECK-NEXT: blr %r = fdiv fast double %a, %b ret double %r @@ -215,9 +215,9 @@ define float @goo2_fmf(float %a, float %b) nounwind { ; CHECK: @goo2_fmf ; CHECK-DAG: fres +; CHECK-NEXT: fmuls ; CHECK-DAG: fnmsubs ; CHECK: fmadds -; CHECK-NEXT: fmuls ; CHECK-NEXT: blr %r = fdiv fast float %a, %b ret float %r diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -122,88 +122,87 @@ ; SSE-LABEL: f32_one_step_variables: ; SSE: # %bb.0: ; SSE-NEXT: rcpss %xmm1, %xmm2 -; SSE-NEXT: mulss %xmm2, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE-NEXT: subss %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 -; SSE-NEXT: addss %xmm2, %xmm3 -; SSE-NEXT: mulss %xmm3, %xmm0 +; SSE-NEXT: mulss %xmm3, %xmm1 +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm2, %xmm0 +; SSE-NEXT: addss %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_variables: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm1, %xmm3, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; AVX-RECIP-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; AVX-RECIP-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; AVX-RECIP-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_variables: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; FMA-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: f32_one_step_variables: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm2, %xmm1, %xmm1 -; BDVER2-NEXT: vfmaddss %xmm2, %xmm1, %xmm2, %xmm1 -; BDVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; BDVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; BDVER2-NEXT: vfnmaddss %xmm0, %xmm3, %xmm1, %xmm0 +; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm2, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_variables: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vsubss %xmm1, %xmm3, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; BTVER2-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; BTVER2-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_variables: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm1, %xmm3, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; SANDY-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; SANDY-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; SANDY-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; SANDY-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step_variables: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; HASWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; HASWELL-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; HASWELL-NEXT: vmovaps %xmm2, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step_variables: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm3, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; HASWELL-NO-FMA-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step_variables: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; AVX512-NEXT: vmovaps %xmm2, %xmm0 ; AVX512-NEXT: retq %div = fdiv fast float %x, %y ret float %div @@ -484,99 +483,88 @@ ; SSE-LABEL: v4f32_one_step_variables: ; SSE: # %bb.0: ; SSE-NEXT: rcpps %xmm1, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: subps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: addps %xmm2, %xmm3 -; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: addps %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step_variables: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm1, %xmm2 -; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-RECIP-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; AVX-RECIP-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; AVX-RECIP-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; AVX-RECIP-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vmulps %xmm0, %xmm2, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step_variables: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; FMA-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v4f32_one_step_variables: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm1, %xmm2 -; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm2, %xmm1, %xmm1 -; BDVER2-NEXT: vfmaddps %xmm2, %xmm1, %xmm2, %xmm1 -; BDVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; BDVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; BDVER2-NEXT: vfnmaddps %xmm0, %xmm3, %xmm1, %xmm0 +; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm2, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step_variables: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm1, %xmm2 -; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; BTVER2-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; BTVER2-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulps %xmm0, %xmm2, %xmm0 +; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step_variables: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm1, %xmm2 -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; SANDY-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; SANDY-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; SANDY-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; SANDY-NEXT: vmulps %xmm0, %xmm2, %xmm0 +; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step_variables: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm1, %xmm2 -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; HASWELL-NEXT: vmovaps %xmm2, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; HASWELL-NO-FMA-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm2, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; -; KNL-LABEL: v4f32_one_step_variables: -; KNL: # %bb.0: -; KNL-NEXT: vrcpps %xmm1, %xmm2 -; KNL-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm3 -; KNL-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; KNL-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: v4f32_one_step_variables: -; SKX: # %bb.0: -; SKX-NEXT: vrcpps %xmm1, %xmm2 -; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; SKX-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; SKX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SKX-NEXT: retq +; AVX512-LABEL: v4f32_one_step_variables: +; AVX512: # %bb.0: +; AVX512-NEXT: vrcpps %xmm1, %xmm2 +; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: retq %div = fdiv fast <4 x float> %x, %y ret <4 x float> %div } diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -32,89 +32,94 @@ ; SSE-LABEL: f32_one_step_2: ; SSE: # %bb.0: ; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: mulss %xmm1, %xmm3 +; SSE-NEXT: mulss %xmm3, %xmm0 ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: mulss %xmm2, %xmm1 -; SSE-NEXT: addss %xmm2, %xmm1 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 +; SSE-NEXT: addss %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_2: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: f32_one_step_2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; BDVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; BDVER2-NEXT: vfnmaddss %xmm2, %xmm3, %xmm0, %xmm0 +; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step_2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; HASWELL-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step_2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; AVX512-NEXT: retq %div = fdiv fast float 3456.0, %x ret float %div @@ -225,19 +230,20 @@ define float @f32_two_step_2(float %x) #2 { ; SSE-LABEL: f32_two_step_2: ; SSE: # %bb.0: -; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: mulss %xmm2, %xmm3 +; SSE-NEXT: rcpss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: mulss %xmm1, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: subss %xmm2, %xmm3 +; SSE-NEXT: mulss %xmm1, %xmm3 +; SSE-NEXT: addss %xmm1, %xmm3 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subss %xmm3, %xmm4 -; SSE-NEXT: mulss %xmm2, %xmm4 -; SSE-NEXT: addss %xmm2, %xmm4 -; SSE-NEXT: mulss %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: mulss %xmm1, %xmm2 +; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: subss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm4, %xmm1 -; SSE-NEXT: addss %xmm4, %xmm1 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 +; SSE-NEXT: mulss %xmm3, %xmm1 +; SSE-NEXT: addss %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -249,49 +255,51 @@ ; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 +; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_two_step_2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: f32_two_step_2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 -; BDVER2-NEXT: vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 -; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm2 +; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; BDVER2-NEXT: vfmaddss %xmm1, %xmm2, %xmm1, %xmm1 +; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3 +; BDVER2-NEXT: vfnmaddss %xmm4, %xmm3, %xmm0, %xmm0 +; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: f32_two_step_2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 +; BTVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 ; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 +; BTVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3 +; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; BTVER2-NEXT: vsubss %xmm0, %xmm4, %xmm0 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_two_step_2: @@ -302,23 +310,24 @@ ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 +; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_two_step_2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vmulss %xmm1, %xmm2, %xmm3 +; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_two_step_2: @@ -329,23 +338,24 @@ ; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_two_step_2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovaps %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm3 +; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; AVX512-NEXT: retq %div = fdiv fast float 6789.0, %x ret float %div @@ -355,100 +365,95 @@ ; SSE-LABEL: v4f32_one_step2: ; SSE: # %bb.0: ; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm3, %xmm0 ; SSE-NEXT: subps %xmm0, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps %xmm2, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step2: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; FMA-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v4f32_one_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 -; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; BDVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; BDVER2-NEXT: vfnmaddps %xmm2, %xmm3, %xmm0, %xmm0 +; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step2: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; -; KNL-LABEL: v4f32_one_step2: -; KNL: # %bb.0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 -; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: v4f32_one_step2: -; SKX: # %bb.0: -; SKX-NEXT: vrcpps %xmm0, %xmm1 -; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: retq +; AVX512-LABEL: v4f32_one_step2: +; AVX512: # %bb.0: +; AVX512-NEXT: vrcpps %xmm0, %xmm1 +; AVX512-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 +; AVX512-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -569,19 +574,20 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; SSE-LABEL: v4f32_two_step2: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subps %xmm3, %xmm4 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: rcpps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: subps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm1, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: addps %xmm4, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: addps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -593,49 +599,51 @@ ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; FMA-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v4f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 -; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 -; BDVER2-NEXT: vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 -; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm2 +; BDVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; BDVER2-NEXT: vfmaddps %xmm1, %xmm2, %xmm1, %xmm1 +; BDVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3 +; BDVER2-NEXT: vfnmaddps %xmm4, %xmm3, %xmm0, %xmm0 +; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 +; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 +; BTVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3 +; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0 +; BTVER2-NEXT: vsubps %xmm0, %xmm4, %xmm0 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_two_step2: @@ -646,23 +654,24 @@ ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; HASWELL-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NEXT: vmulps %xmm1, %xmm2, %xmm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step2: @@ -673,23 +682,24 @@ ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v4f32_two_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpps %xmm0, %xmm1 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vmovaps %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; AVX512-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; AVX512-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div @@ -698,20 +708,22 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step2: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm1, %xmm4 +; SSE-NEXT: rcpps %xmm0, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: mulps %xmm2, %xmm4 +; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: subps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm3, %xmm2 +; SSE-NEXT: addps %xmm4, %xmm2 +; SSE-NEXT: rcpps %xmm1, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: mulps %xmm3, %xmm4 ; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm4, %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm4, %xmm3 -; SSE-NEXT: rcpps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: subps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: retq @@ -719,88 +731,82 @@ ; AVX-RECIP-LABEL: v8f32_one_step2: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v8f32_one_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 -; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 -; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; BDVER2-NEXT: vfnmaddps %ymm2, %ymm3, %ymm0, %ymm0 +; BDVER2-NEXT: vfmaddps %ymm3, %ymm0, %ymm1, %ymm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v8f32_one_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; -; KNL-LABEL: v8f32_one_step2: -; KNL: # %bb.0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 -; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: retq -; -; SKX-LABEL: v8f32_one_step2: -; SKX: # %bb.0: -; SKX-NEXT: vrcpps %ymm0, %ymm1 -; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem -; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: retq +; AVX512-LABEL: v8f32_one_step2: +; AVX512: # %bb.0: +; AVX512-NEXT: vrcpps %ymm0, %ymm1 +; AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 +; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm3 +; AVX512-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -930,33 +936,35 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; SSE-LABEL: v8f32_two_step2: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps %xmm4, %xmm5 -; SSE-NEXT: mulps %xmm3, %xmm5 -; SSE-NEXT: addps %xmm3, %xmm5 -; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm5, %xmm3 -; SSE-NEXT: addps %xmm5, %xmm3 -; SSE-NEXT: rcpps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps %xmm4, %xmm5 -; SSE-NEXT: mulps %xmm1, %xmm5 -; SSE-NEXT: addps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: subps %xmm3, %xmm5 +; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: addps %xmm2, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: subps %xmm0, %xmm2 ; SSE-NEXT: mulps %xmm5, %xmm2 -; SSE-NEXT: subps %xmm2, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: addps %xmm5, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 +; SSE-NEXT: addps %xmm3, %xmm2 +; SSE-NEXT: rcpps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm3 +; SSE-NEXT: subps %xmm3, %xmm4 +; SSE-NEXT: mulps %xmm0, %xmm4 +; SSE-NEXT: addps %xmm0, %xmm4 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm4, %xmm3 +; SSE-NEXT: addps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: retq ; @@ -968,49 +976,51 @@ ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 +; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v8f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 -; BDVER2-NEXT: vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 -; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 -; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm2 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vfmaddps %ymm1, %ymm2, %ymm1, %ymm1 +; BDVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3 +; BDVER2-NEXT: vfnmaddps %ymm4, %ymm3, %ymm0, %ymm0 +; BDVER2-NEXT: vfmaddps %ymm3, %ymm0, %ymm1, %ymm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3 +; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0 +; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_two_step2: @@ -1021,23 +1031,24 @@ ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vmovaps %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 +; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step2: @@ -1048,23 +1059,24 @@ ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v8f32_two_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpps %ymm0, %ymm1 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vmovaps %ymm1, %ymm3 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 -; AVX512-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 +; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1 +; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX512-NEXT: vmulps %ymm1, %ymm2, %ymm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1 +; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm3 ; AVX512-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -1088,9 +1100,9 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { ; SSE-LABEL: v8f32_no_step2: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: rcpps %xmm0, %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 ; SSE-NEXT: retq ; @@ -1106,153 +1118,165 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_one_step2: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: rcpps %xmm3, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm4, %xmm3 -; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: addps %xmm2, %xmm3 -; SSE-NEXT: rcpps %xmm5, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: subps %xmm5, %xmm2 -; SSE-NEXT: mulps %xmm4, %xmm2 -; SSE-NEXT: addps %xmm4, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm0, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm5, %xmm0 +; SSE-NEXT: addps %xmm6, %xmm0 +; SSE-NEXT: rcpps %xmm4, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm1, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm4 +; SSE-NEXT: subps %xmm4, %xmm1 ; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subps %xmm1, %xmm4 +; SSE-NEXT: addps %xmm6, %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm4, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm2 +; SSE-NEXT: subps %xmm2, %xmm4 ; SSE-NEXT: mulps %xmm5, %xmm4 -; SSE-NEXT: addps %xmm5, %xmm4 -; SSE-NEXT: rcpps %xmm6, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm6 -; SSE-NEXT: subps %xmm6, %xmm0 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm4 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: addps %xmm6, %xmm4 +; SSE-NEXT: rcpps %xmm3, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: mulps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm3 +; SSE-NEXT: subps %xmm3, %xmm5 +; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: addps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v16f32_one_step2: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; AVX-RECIP-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm4, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_one_step2: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4 +; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm2 * ymm1) + ymm4 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_one_step2: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vrcpps %ymm1, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vrcpps %ymm0, %ymm4 -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 +; BDVER2-NEXT: vrcpps %ymm0, %ymm2 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vrcpps %ymm1, %ymm5 +; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 -; BDVER2-NEXT: vfmaddps %ymm4, %ymm0, %ymm4, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BDVER2-NEXT: vfmaddps %ymm4, %ymm0, %ymm2, %ymm0 +; BDVER2-NEXT: vmulps %ymm3, %ymm5, %ymm4 +; BDVER2-NEXT: vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1 +; BDVER2-NEXT: vfmaddps %ymm4, %ymm1, %ymm5, %ymm1 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v16f32_one_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BTVER2-NEXT: vrcpps %ymm1, %ymm2 -; BTVER2-NEXT: vrcpps %ymm0, %ymm4 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BTVER2-NEXT: vrcpps %ymm0, %ymm2 +; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; BTVER2-NEXT: vmulps %ymm4, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; BTVER2-NEXT: vmulps %ymm0, %ymm4, %ymm0 -; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 +; BTVER2-NEXT: vrcpps %ymm1, %ymm2 +; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5 ; BTVER2-NEXT: vaddps %ymm0, %ymm4, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; BTVER2-NEXT: vmulps %ymm5, %ymm1, %ymm1 +; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; BTVER2-NEXT: vaddps %ymm1, %ymm5, %ymm1 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v16f32_one_step2: ; SANDY: # %bb.0: -; SANDY-NEXT: vrcpps %ymm1, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; SANDY-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; SANDY-NEXT: vrcpps %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; SANDY-NEXT: vaddps %ymm0, %ymm4, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; SANDY-NEXT: vsubps %ymm1, %ymm2, %ymm1 +; SANDY-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v16f32_one_step2: ; HASWELL: # %bb.0: -; HASWELL-NEXT: vrcpps %ymm1, %ymm2 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vrcpps %ymm0, %ymm4 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 +; HASWELL-NEXT: vrcpps %ymm0, %ymm2 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4 +; HASWELL-NEXT: vrcpps %ymm1, %ymm2 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm2 * ymm1) + ymm4 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v16f32_one_step2: ; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm4, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm2, %ymm1 +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v16f32_one_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem -; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 -; AVX512-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vmovaps {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 +; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm3 ; AVX512-NEXT: retq %div = fdiv fast <16 x float> , %x ret <16 x float> %div @@ -1436,228 +1460,245 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { ; SSE-LABEL: v16f32_two_step2: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: rcpps %xmm3, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: subps %xmm3, %xmm7 -; SSE-NEXT: mulps %xmm2, %xmm7 -; SSE-NEXT: addps %xmm2, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm6, %xmm3 -; SSE-NEXT: mulps %xmm7, %xmm3 -; SSE-NEXT: addps %xmm7, %xmm3 -; SSE-NEXT: rcpps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: mulps %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: subps %xmm6, %xmm7 -; SSE-NEXT: mulps %xmm2, %xmm7 -; SSE-NEXT: addps %xmm2, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: subps %xmm5, %xmm2 -; SSE-NEXT: mulps %xmm7, %xmm2 -; SSE-NEXT: addps %xmm7, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: mulps %xmm5, %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: subps %xmm6, %xmm7 -; SSE-NEXT: mulps %xmm5, %xmm7 -; SSE-NEXT: addps %xmm5, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps %xmm1, %xmm5 -; SSE-NEXT: mulps %xmm7, %xmm5 -; SSE-NEXT: addps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: subps %xmm5, %xmm7 +; SSE-NEXT: mulps %xmm0, %xmm7 +; SSE-NEXT: addps %xmm0, %xmm7 +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm5 +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm7, %xmm0 +; SSE-NEXT: addps %xmm5, %xmm0 ; SSE-NEXT: rcpps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: mulps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: subps %xmm6, %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: mulps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: subps %xmm5, %xmm7 ; SSE-NEXT: mulps %xmm1, %xmm7 ; SSE-NEXT: addps %xmm1, %xmm7 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: mulps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm5, %xmm4 +; SSE-NEXT: subps %xmm4, %xmm1 +; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: addps %xmm5, %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: mulps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: subps %xmm5, %xmm7 +; SSE-NEXT: mulps %xmm4, %xmm7 +; SSE-NEXT: addps %xmm4, %xmm7 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: mulps %xmm4, %xmm5 +; SSE-NEXT: mulps %xmm5, %xmm2 +; SSE-NEXT: subps %xmm2, %xmm4 ; SSE-NEXT: mulps %xmm7, %xmm4 -; SSE-NEXT: subps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm7, %xmm0 -; SSE-NEXT: addps %xmm7, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm5 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: addps %xmm5, %xmm4 +; SSE-NEXT: rcpps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: subps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm2, %xmm6 +; SSE-NEXT: addps %xmm2, %xmm6 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: mulps %xmm5, %xmm2 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: subps %xmm3, %xmm5 +; SSE-NEXT: mulps %xmm6, %xmm5 +; SSE-NEXT: addps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v16f32_two_step2: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; AVX-RECIP-NEXT: vmulps %ymm5, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_two_step2: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmulps %ymm2, %ymm4, %ymm5 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm5 * ymm0) + ymm2 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm5 +; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 +; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} ymm3 = -(ymm1 * ymm2) + ymm3 +; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm2) + ymm2 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; FMA-RECIP-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm2 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm4 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_two_step2: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vrcpps %ymm1, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vmulps %ymm4, %ymm2, %ymm5 +; BDVER2-NEXT: vfnmaddps %ymm4, %ymm5, %ymm0, %ymm0 +; BDVER2-NEXT: vfmaddps %ymm5, %ymm0, %ymm2, %ymm0 +; BDVER2-NEXT: vrcpps %ymm1, %ymm2 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm3 +; BDVER2-NEXT: vfmaddps %ymm2, %ymm3, %ymm2, %ymm2 +; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4 +; BDVER2-NEXT: vfnmaddps %ymm5, %ymm4, %ymm1, %ymm1 +; BDVER2-NEXT: vfmaddps %ymm4, %ymm1, %ymm2, %ymm1 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v16f32_two_step2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BTVER2-NEXT: vrcpps %ymm1, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 -; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 -; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 -; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; BTVER2-NEXT: vmulps %ymm5, %ymm0, %ymm0 +; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vrcpps %ymm1, %ymm2 +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; BTVER2-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 +; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 +; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 +; BTVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4 +; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; BTVER2-NEXT: vsubps %ymm1, %ymm5, %ymm1 +; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v16f32_two_step2: ; SANDY: # %bb.0: -; SANDY-NEXT: vrcpps %ymm1, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 -; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 -; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 +; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; SANDY-NEXT: vmulps %ymm5, %ymm0, %ymm0 +; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; SANDY-NEXT: vrcpps %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; SANDY-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; SANDY-NEXT: vmulps %ymm3, %ymm1, %ymm2 +; SANDY-NEXT: vsubps %ymm2, %ymm4, %ymm2 +; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; SANDY-NEXT: vaddps %ymm2, %ymm3, %ymm2 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v16f32_two_step2: ; HASWELL: # %bb.0: -; HASWELL-NEXT: vrcpps %ymm1, %ymm2 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vmovaps %ymm2, %ymm4 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmulps %ymm2, %ymm4, %ymm5 +; HASWELL-NEXT: vrcpps %ymm1, %ymm6 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm5 * ymm0) + ymm2 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm5 +; HASWELL-NEXT: vfnmadd231ps {{.*#+}} ymm3 = -(ymm1 * ymm6) + ymm3 +; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm6) + ymm6 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm2 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm4 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v16f32_two_step2: ; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 -; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 -; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; HASWELL-NO-FMA-NEXT: vmulps %ymm5, %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm1, %ymm2 +; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm4, %ymm2 +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm3, %ymm2 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v16f32_two_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 ; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vmovaps %zmm1, %zmm3 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 -; AVX512-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2 +; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm2 = (zmm2 * zmm1) + zmm1 +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX512-NEXT: vmulps %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm1 +; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm3 ; AVX512-NEXT: retq %div = fdiv fast <16 x float> , %x ret <16 x float> %div @@ -1725,68 +1766,68 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 { ; SSE-LABEL: v16f32_no_step2: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm3, %xmm3 -; SSE-NEXT: rcpps %xmm2, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: rcpps %xmm0, %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm2 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 +; SSE-NEXT: rcpps %xmm3, %xmm3 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v16f32_no_step2: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_no_step2: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_no_step2: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vrcpps %ymm1, %ymm1 ; BDVER2-NEXT: vrcpps %ymm0, %ymm0 +; BDVER2-NEXT: vrcpps %ymm1, %ymm1 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v16f32_no_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vrcpps %ymm1, %ymm1 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 +; BTVER2-NEXT: vrcpps %ymm1, %ymm1 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v16f32_no_step2: ; SANDY: # %bb.0: -; SANDY-NEXT: vrcpps %ymm1, %ymm1 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm1, %ymm1 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v16f32_no_step2: ; HASWELL: # %bb.0: -; HASWELL-NEXT: vrcpps %ymm1, %ymm1 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm1, %ymm1 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v16f32_no_step2: ; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; HASWELL-NO-FMA-NEXT: retq