Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17452,36 +17452,35 @@ Est = UseOneConstNR ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal) : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); - - if (!Reciprocal) { - // The estimate is now completely wrong if the input was exactly 0.0 or - // possibly a denormal. Force the answer to 0.0 for those cases. - EVT VT = Op.getValueType(); - SDLoc DL(Op); - EVT CCVT = getSetCCResultType(VT); - ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; - const Function &F = DAG.getMachineFunction().getFunction(); - Attribute Denorms = F.getFnAttribute("denormal-fp-math"); - if (Denorms.getValueAsString().equals("ieee")) { - // fabs(X) < SmallestNormal ? 0.0 : Est - const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); - APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem); - SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT); - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); - SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); - Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est); - AddToWorklist(Fabs.getNode()); - AddToWorklist(IsDenorm.getNode()); - AddToWorklist(Est.getNode()); - } else { - // X == 0.0 ? 0.0 : Est - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); - Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est); - AddToWorklist(IsZero.getNode()); - AddToWorklist(Est.getNode()); - } + } + if (!Reciprocal) { + // The estimate is now completely wrong if the input was exactly 0.0 or + // possibly a denormal. Force the answer to 0.0 for those cases. + EVT VT = Op.getValueType(); + SDLoc DL(Op); + EVT CCVT = getSetCCResultType(VT); + ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + const Function &F = DAG.getMachineFunction().getFunction(); + Attribute Denorms = F.getFnAttribute("denormal-fp-math"); + if (Denorms.getValueAsString().equals("ieee")) { + // fabs(X) < SmallestNormal ? 0.0 : Est + const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); + APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem); + SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT); + SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); + SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); + SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); + Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est); + AddToWorklist(Fabs.getNode()); + AddToWorklist(IsDenorm.getNode()); + AddToWorklist(Est.getNode()); + } else { + // X == 0.0 ? 0.0 : Est + SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); + SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); + Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est); + AddToWorklist(IsZero.getNode()); + AddToWorklist(Est.getNode()); } } return Est; Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5007,20 +5007,9 @@ Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } - // FIXME: This does not detect denorm inputs, so we might produce INF - // when we should produce 0.0. Try to refactor the code in DAGCombiner, - // so we don't have to duplicate it here. - if (!Reciprocal) { - EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), - VT); - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); + if (!Reciprocal) Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); - // Correct the result if the operand is 0.0. - Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, - VT, Eq, Operand, Estimate); - } ExtraSteps = 0; return Estimate; Index: test/CodeGen/AArch64/sqrt-fastmath.ll =================================================================== --- test/CodeGen/AArch64/sqrt-fastmath.ll +++ test/CodeGen/AArch64/sqrt-fastmath.ll @@ -33,6 +33,9 @@ ret float %1 } +; CHECK: .LCPI1_0: +; CHECK-NEXT: .word 8388608 // float 1.17549435E-38 + define float @fsqrt_ieee_denorms(float %a) #1 { ; FAULT-LABEL: fsqrt_ieee_denorms: ; FAULT: // %bb.0: @@ -41,16 +44,20 @@ ; ; CHECK-LABEL: fsqrt_ieee_denorms: ; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: ldr s1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: fabs s2, s0 +; CHECK-NEXT: fcmp s2, s1 ; CHECK-NEXT: frsqrte s1, s0 ; CHECK-NEXT: fmul s2, s1, s1 ; CHECK-NEXT: frsqrts s2, s0, s2 ; CHECK-NEXT: fmul s1, s1, s2 ; CHECK-NEXT: fmul s2, s1, s1 ; CHECK-NEXT: frsqrts s2, s0, s2 -; CHECK-NEXT: fmul s2, s2, s0 -; CHECK-NEXT: fmul s1, s1, s2 -; CHECK-NEXT: fcmp s0, #0.0 -; CHECK-NEXT: fcsel s0, s0, s1, eq +; CHECK-NEXT: fmul s0, s2, s0 +; CHECK-NEXT: fmul s0, s1, s0 +; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: fcsel s0, s1, s0, lt ; CHECK-NEXT: ret %1 = tail call fast float @llvm.sqrt.f32(float %a) ret float %1 @@ -71,10 +78,9 @@ ; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s ; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s ; CHECK-NEXT: fmul v2.2s, v2.2s, v0.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v2.2s -; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0 -; CHECK-NEXT: bsl v1.8b, v0.8b, v2.8b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s +; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0 +; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b ; CHECK-NEXT: ret %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) ret <2 x float> %1 @@ -95,10 +101,9 @@ ; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s ; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s ; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v2.4s -; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) ret <4 x float> %1 @@ -114,27 +119,25 @@ ; CHECK-LABEL: f8sqrt: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte v2.4s, v0.4s -; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s -; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s -; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s -; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v3.4s, v0.4s -; CHECK-NEXT: fmul v3.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.4s, v1.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s -; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s -; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v3.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: fcmeq v3.4s, v1.4s, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s +; CHECK-NEXT: frsqrte v3.4s, v1.4s +; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s +; CHECK-NEXT: fmul v4.4s, v3.4s, v3.4s +; CHECK-NEXT: frsqrts v4.4s, v1.4s, v4.4s +; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s +; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s +; CHECK-NEXT: fmul v4.4s, v4.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s +; CHECK-NEXT: fmul v4.4s, v3.4s, v3.4s +; CHECK-NEXT: frsqrts v4.4s, v1.4s, v4.4s +; CHECK-NEXT: fmul v4.4s, v4.4s, v1.4s +; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; CHECK-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; CHECK-NEXT: bic v0.16b, v2.16b, v0.16b +; CHECK-NEXT: bic v1.16b, v3.16b, v1.16b ; CHECK-NEXT: ret %1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a) ret <8 x float> %1 @@ -166,6 +169,9 @@ ret double %1 } +; CHECK: .LCPI6_0: +; CHECK-NEXT: .xword 4503599627370496 // double 2.2250738585072014E-308 + define double @dsqrt_ieee_denorms(double %a) #1 { ; FAULT-LABEL: dsqrt_ieee_denorms: ; FAULT: // %bb.0: @@ -174,6 +180,10 @@ ; ; CHECK-LABEL: dsqrt_ieee_denorms: ; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: fabs d2, d0 +; CHECK-NEXT: fcmp d2, d1 ; CHECK-NEXT: frsqrte d1, d0 ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 @@ -183,10 +193,10 @@ ; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 -; CHECK-NEXT: fmul d2, d2, d0 -; CHECK-NEXT: fmul d1, d1, d2 -; CHECK-NEXT: fcmp d0, #0.0 -; CHECK-NEXT: fcsel d0, d0, d1, eq +; CHECK-NEXT: fmul d0, d2, d0 +; CHECK-NEXT: fmul d0, d1, d0 +; CHECK-NEXT: fmov d1, xzr +; CHECK-NEXT: fcsel d0, d1, d0, lt ; CHECK-NEXT: ret %1 = tail call fast double @llvm.sqrt.f64(double %a) ret double %1 @@ -210,10 +220,9 @@ ; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d ; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d ; CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v2.2d -; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d +; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) ret <2 x double> %1 @@ -229,33 +238,31 @@ ; CHECK-LABEL: d4sqrt: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte v2.2d, v0.2d -; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d -; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d -; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d -; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v3.2d, v0.2d -; CHECK-NEXT: fmul v3.2d, v2.2d, v3.2d -; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.2d, v1.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d -; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d -; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d -; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v3.2d, v1.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fcmeq v3.2d, v1.2d, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d +; CHECK-NEXT: frsqrte v3.2d, v1.2d +; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d +; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d +; CHECK-NEXT: frsqrts v4.2d, v1.2d, v4.2d +; CHECK-NEXT: fmul v3.2d, v3.2d, v4.2d +; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d +; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d +; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d +; CHECK-NEXT: frsqrts v4.2d, v1.2d, v4.2d +; CHECK-NEXT: fmul v3.2d, v3.2d, v4.2d +; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d +; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d +; CHECK-NEXT: fmul v4.2d, v4.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d +; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d +; CHECK-NEXT: frsqrts v4.2d, v1.2d, v4.2d +; CHECK-NEXT: fmul v4.2d, v4.2d, v1.2d +; CHECK-NEXT: fmul v3.2d, v3.2d, v4.2d +; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; CHECK-NEXT: fcmeq v1.2d, v1.2d, #0.0 +; CHECK-NEXT: bic v0.16b, v2.16b, v0.16b +; CHECK-NEXT: bic v1.16b, v3.16b, v1.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) ret <4 x double> %1