Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17454,19 +17454,34 @@
                 : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
 
       if (!Reciprocal) {
-        // Unfortunately, Est is now NaN if the input was exactly 0.0.
-        // Select out this case and force the answer to 0.0.
+        // The estimate is now completely wrong if the input was exactly 0.0 or
+        // possibly a denormal. Force the answer to 0.0 for those cases.
         EVT VT = Op.getValueType();
         SDLoc DL(Op);
-
-        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
         EVT CCVT = getSetCCResultType(VT);
-        SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
-        AddToWorklist(ZeroCmp.getNode());
-
-        Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
-                          ZeroCmp, FPZero, Est);
-        AddToWorklist(Est.getNode());
+        ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+        const Function &F = DAG.getMachineFunction().getFunction();
+        Attribute Denorms = F.getFnAttribute("denormal-fp-math");
+        if (Denorms.getValueAsString().equals("ieee")) {
+          // fabs(X) < SmallestNormal ? 0.0 : Est
+          const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
+          APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
+          SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
+          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+          SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
+          SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
+          Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
+          AddToWorklist(Fabs.getNode());
+          AddToWorklist(IsDenorm.getNode());
+          AddToWorklist(Est.getNode());
+        } else {
+          // X == 0.0 ? 0.0 : Est
+          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+          SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
+          Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
+          AddToWorklist(IsZero.getNode());
+          AddToWorklist(Est.getNode());
+        }
       }
     }
     return Est;
Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5007,7 +5007,9 @@
         Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
       }
-
+      // FIXME: This does not detect denorm inputs, so we might produce INF
+      // when we should produce 0.0. Try to refactor the code in DAGCombiner,
+      // so we don't have to duplicate it here.
       if (!Reciprocal) {
         EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                       VT);
Index: llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll
+++ llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll
@@ -121,8 +121,8 @@
 ; SSE-NEXT:    mulss %xmm1, %xmm2
 ; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
 ; SSE-NEXT:    mulss %xmm3, %xmm2
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    cmpeqss %xmm1, %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    cmpltss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    andnps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
@@ -134,8 +134,8 @@
 ; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
 ; AVX-NEXT:    vmulss %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vcmpltss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vandnps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %call = tail call float @__sqrtf_finite(float %x) #2
@@ -145,17 +145,19 @@
 define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
 ; SSE-LABEL: sqrt_v4f32_check_denorms:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    rsqrtps %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm1, %xmm2
+; SSE-NEXT:    rsqrtps %xmm0, %xmm2
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01]
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    mulps %xmm1, %xmm2
-; SSE-NEXT:    addps {{.*}}(%rip), %xmm2
-; SSE-NEXT:    mulps %xmm3, %xmm2
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    cmpneqps %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    mulps %xmm1, %xmm3
+; SSE-NEXT:    mulps %xmm2, %xmm1
+; SSE-NEXT:    addps {{.*}}(%rip), %xmm1
+; SSE-NEXT:    mulps %xmm3, %xmm1
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.175494e-38,1.175494e-38,1.175494e-38,1.175494e-38]
+; SSE-NEXT:    cmpleps %xmm0, %xmm2
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sqrt_v4f32_check_denorms:
@@ -166,8 +168,9 @@
 ; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vaddps {{.*}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vmulps %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vcmpneqps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovaps {{.*#+}} xmm2 = [1.175494e-38,1.175494e-38,1.175494e-38,1.175494e-38]
+; AVX-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
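
Note (not part of the patch): below is a minimal standalone C++ sketch of the scalar semantics the new DAGCombiner selects are intended to enforce. The helper names and the use of FLT_MIN as the smallest normalized float are illustrative assumptions, and refined_rsqrt() is only a stand-in for the real rsqrtps/FRSQRTE estimate plus Newton-Raphson refinement.

#include <cfloat>
#include <cmath>

// Placeholder for the hardware reciprocal-sqrt estimate plus refinement steps.
static float refined_rsqrt(float x) { return 1.0f / std::sqrt(x); }

// With "denormal-fp-math"="ieee": zero and denormal inputs must be forced to
// 0.0, because the refined estimate is wrong for them (NaN for 0.0, possibly
// INF for denormals). Mirrors "fabs(X) < SmallestNormal ? 0.0 : Est".
float sqrt_ieee_denorms(float x) {
  float est = x * refined_rsqrt(x);            // sqrt(x) ~= x * rsqrt(x)
  return std::fabs(x) < FLT_MIN ? 0.0f : est;  // FLT_MIN == 1.17549435e-38
}

// Otherwise (denormals flushed): only exact 0.0 needs the fixup.
// Mirrors "X == 0.0 ? 0.0 : Est".
float sqrt_flush_denorms(float x) {
  float est = x * refined_rsqrt(x);
  return x == 0.0f ? 0.0f : est;
}

The fabs compare is needed in the "ieee" case because a denormal input compares unequal to 0.0 yet still breaks the estimate, which is the same gap the AArch64 FIXME above leaves open in its target-specific copy of this logic.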