diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4206,6 +4206,13 @@ return SDValue(); } + /// Return a target-dependent sqrt result for an input operand that is not + /// suitable for a square root estimate; an empty SDValue means use 0.0. + virtual SDValue getSqrtResultForDenormInput(SDValue Operand, + SelectionDAG &DAG) const { + return SDValue(); + } + //===--------------------------------------------------------------------===// // Legalization utility functions // diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -21375,8 +21375,6 @@ : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); if (!Reciprocal) { - // The estimate is now completely wrong if the input was exactly 0.0 or - // possibly a denormal. Force the answer to 0.0 for those cases. SDLoc DL(Op); EVT CCVT = getSetCCResultType(VT); SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); @@ -21400,9 +21398,17 @@ // Test = X == 0.0 Test = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); } - // Test ? 0.0 : Est + + // The estimate is now completely wrong if the input was exactly 0.0 or + // possibly a denormal. Force the answer to 0.0, or to the value provided + // by the target, for those cases. + SDValue ResultForDenorm = TLI.getSqrtResultForDenormInput(Op, DAG); + if (!ResultForDenorm) + ResultForDenorm = FPZero; + + // Test ? ResultForDenorm : Est Est = DAG.getNode(VT.isVector() ? 
ISD::VSELECT : ISD::SELECT, DL, VT, - Test, FPZero, Est); + Test, ResultForDenorm, Est); } } return Est; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -92,6 +92,9 @@ /// Test instruction for software square root. FTSQRT, + /// Square root instruction, for inputs unsuitable for a sqrt estimate. + FSQRT, + // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking // three v4f32 operands and producing a v4f32 result. VMADDFP, @@ -1229,6 +1232,8 @@ int &RefinementSteps) const override; SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG, const DenormalMode &Mode) const override; + SDValue getSqrtResultForDenormInput(SDValue Operand, + SelectionDAG &DAG) const override; unsigned combineRepeatedFPDivisors() const override; SDValue diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1435,6 +1435,8 @@ case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; case PPCISD::FTSQRT: return "PPCISD::FTSQRT"; + case PPCISD::FSQRT: + return "PPCISD::FSQRT"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VMADDFP: return "PPCISD::VMADDFP"; case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; @@ -12403,6 +12405,16 @@ 0); } +SDValue +PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + if (VT != MVT::f64) + return SDValue(); + + return DAG.getNode(PPCISD::FSQRT, SDLoc(Op), VT, Op); +} + SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -123,6 +123,7 @@ def PPCfre : 
SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; +def PPCfsqrt : SDNode<"PPCISD::FSQRT", SDTFPUnaryOp, []>; def PPCftsqrt : SDNode<"PPCISD::FTSQRT", SDT_PPCFtsqrt,[]>; def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; @@ -2602,6 +2603,8 @@ } } +def : Pat<(PPCfsqrt f64:$frA), (FSQRT $frA)>; + /// Note that FMR is defined as pseudo-ops on the PPC970 because they are /// often coalesced away and we don't want the dispatch group builder to think /// that they will fill slots (which could cause the load of a LSU reject to diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -2433,6 +2433,8 @@ def : Pat<(fma v4f32:$A, (fneg v4f32:$B), v4f32:$C), (XVNMSUBASP $C, $A, $B)>; +def : Pat<(PPCfsqrt f64:$frA), (XSSQRTDP $frA)>; + def : Pat<(v2f64 (bitconvert v4f32:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v2f64 (bitconvert v4i32:$A)), diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate.ll b/llvm/test/CodeGen/PowerPC/fma-mutate.ll --- a/llvm/test/CodeGen/PowerPC/fma-mutate.ll +++ b/llvm/test/CodeGen/PowerPC/fma-mutate.ll @@ -10,7 +10,6 @@ ; CHECK-LABEL: foo3_fmf: ; CHECK: # %bb.0: ; CHECK-NEXT: xstsqrtdp 0, 1 -; CHECK-NEXT: xxlxor 0, 0, 0 ; CHECK-NEXT: bc 12, 2, .LBB0_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: xsrsqrtedp 0, 1 @@ -25,9 +24,10 @@ ; CHECK-NEXT: xsmuldp 1, 1, 0 ; CHECK-NEXT: xsmaddadp 3, 1, 0 ; CHECK-NEXT: xsmuldp 0, 1, 4 -; CHECK-NEXT: xsmuldp 0, 0, 3 +; CHECK-NEXT: xsmuldp 1, 0, 3 +; CHECK-NEXT: blr ; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: fmr 1, 0 +; CHECK-NEXT: xssqrtdp 1, 1 ; CHECK-NEXT: blr %r = call reassoc afn ninf double @llvm.sqrt.f64(double %a) ret double %r diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -770,14 +770,12 @@ ; 
CHECK-P7-NEXT: fmul 1, 1, 0 ; CHECK-P7-NEXT: blr ; CHECK-P7-NEXT: .LBB20_2: -; CHECK-P7-NEXT: addis 3, 2, .LCPI20_2@toc@ha -; CHECK-P7-NEXT: lfs 1, .LCPI20_2@toc@l(3) +; CHECK-P7-NEXT: fsqrt 1, 1 ; CHECK-P7-NEXT: blr ; ; CHECK-P8-LABEL: foo3_fmf: ; CHECK-P8: # %bb.0: ; CHECK-P8-NEXT: xstsqrtdp 0, 1 -; CHECK-P8-NEXT: xxlxor 0, 0, 0 ; CHECK-P8-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P8-NEXT: # %bb.1: ; CHECK-P8-NEXT: xsrsqrtedp 0, 1 @@ -793,15 +791,15 @@ ; CHECK-P8-NEXT: xsmuldp 1, 1, 0 ; CHECK-P8-NEXT: xsmaddadp 3, 1, 0 ; CHECK-P8-NEXT: xsmuldp 0, 1, 4 -; CHECK-P8-NEXT: xsmuldp 0, 0, 3 +; CHECK-P8-NEXT: xsmuldp 1, 0, 3 +; CHECK-P8-NEXT: blr ; CHECK-P8-NEXT: .LBB20_2: -; CHECK-P8-NEXT: fmr 1, 0 +; CHECK-P8-NEXT: xssqrtdp 1, 1 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: foo3_fmf: ; CHECK-P9: # %bb.0: ; CHECK-P9-NEXT: xstsqrtdp 0, 1 -; CHECK-P9-NEXT: xxlxor 0, 0, 0 ; CHECK-P9-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P9-NEXT: # %bb.1: ; CHECK-P9-NEXT: xsrsqrtedp 0, 1 @@ -817,9 +815,10 @@ ; CHECK-P9-NEXT: xsmuldp 1, 1, 0 ; CHECK-P9-NEXT: xsmaddadp 3, 1, 0 ; CHECK-P9-NEXT: xsmuldp 0, 1, 2 -; CHECK-P9-NEXT: xsmuldp 0, 0, 3 +; CHECK-P9-NEXT: xsmuldp 1, 0, 3 +; CHECK-P9-NEXT: blr ; CHECK-P9-NEXT: .LBB20_2: -; CHECK-P9-NEXT: fmr 1, 0 +; CHECK-P9-NEXT: xssqrtdp 1, 1 ; CHECK-P9-NEXT: blr %r = call reassoc ninf afn double @llvm.sqrt.f64(double %a) ret double %r @@ -1031,45 +1030,41 @@ define <2 x double> @hoo4_fmf(<2 x double> %a) #1 { ; CHECK-P7-LABEL: hoo4_fmf: ; CHECK-P7: # %bb.0: -; CHECK-P7-NEXT: addis 3, 2, .LCPI26_2@toc@ha ; CHECK-P7-NEXT: ftsqrt 0, 1 -; CHECK-P7-NEXT: fmr 3, 1 -; CHECK-P7-NEXT: addis 4, 2, .LCPI26_0@toc@ha -; CHECK-P7-NEXT: lfs 0, .LCPI26_2@toc@l(3) -; CHECK-P7-NEXT: addis 3, 2, .LCPI26_1@toc@ha -; CHECK-P7-NEXT: lfs 5, .LCPI26_0@toc@l(4) -; CHECK-P7-NEXT: lfs 4, .LCPI26_1@toc@l(3) -; CHECK-P7-NEXT: fmr 1, 0 -; CHECK-P7-NEXT: bc 4, 2, .LBB26_3 +; CHECK-P7-NEXT: addis 3, 2, .LCPI26_0@toc@ha +; CHECK-P7-NEXT: addis 4, 2, .LCPI26_1@toc@ha +; CHECK-P7-NEXT: lfs 3, 
.LCPI26_0@toc@l(3) +; CHECK-P7-NEXT: lfs 0, .LCPI26_1@toc@l(4) +; CHECK-P7-NEXT: bc 12, 2, .LBB26_3 ; CHECK-P7-NEXT: # %bb.1: +; CHECK-P7-NEXT: frsqrte 4, 1 +; CHECK-P7-NEXT: fmul 5, 1, 4 +; CHECK-P7-NEXT: fmadd 5, 5, 4, 3 +; CHECK-P7-NEXT: fmul 4, 4, 0 +; CHECK-P7-NEXT: fmul 4, 4, 5 +; CHECK-P7-NEXT: fmul 1, 1, 4 +; CHECK-P7-NEXT: fmadd 4, 1, 4, 3 +; CHECK-P7-NEXT: fmul 1, 1, 0 +; CHECK-P7-NEXT: fmul 1, 1, 4 ; CHECK-P7-NEXT: ftsqrt 0, 2 ; CHECK-P7-NEXT: bc 4, 2, .LBB26_4 ; CHECK-P7-NEXT: .LBB26_2: -; CHECK-P7-NEXT: fmr 2, 0 +; CHECK-P7-NEXT: fsqrt 2, 2 ; CHECK-P7-NEXT: blr ; CHECK-P7-NEXT: .LBB26_3: -; CHECK-P7-NEXT: frsqrte 1, 3 -; CHECK-P7-NEXT: fmul 6, 3, 1 -; CHECK-P7-NEXT: fmadd 6, 6, 1, 5 -; CHECK-P7-NEXT: fmul 1, 1, 4 -; CHECK-P7-NEXT: fmul 1, 1, 6 -; CHECK-P7-NEXT: fmul 3, 3, 1 -; CHECK-P7-NEXT: fmadd 1, 3, 1, 5 -; CHECK-P7-NEXT: fmul 3, 3, 4 -; CHECK-P7-NEXT: fmul 1, 3, 1 +; CHECK-P7-NEXT: fsqrt 1, 1 ; CHECK-P7-NEXT: ftsqrt 0, 2 ; CHECK-P7-NEXT: bc 12, 2, .LBB26_2 ; CHECK-P7-NEXT: .LBB26_4: -; CHECK-P7-NEXT: frsqrte 0, 2 -; CHECK-P7-NEXT: fmul 3, 2, 0 -; CHECK-P7-NEXT: fmadd 3, 3, 0, 5 -; CHECK-P7-NEXT: fmul 0, 0, 4 -; CHECK-P7-NEXT: fmul 0, 0, 3 -; CHECK-P7-NEXT: fmul 2, 2, 0 -; CHECK-P7-NEXT: fmadd 0, 2, 0, 5 +; CHECK-P7-NEXT: frsqrte 4, 2 +; CHECK-P7-NEXT: fmul 5, 2, 4 +; CHECK-P7-NEXT: fmadd 5, 5, 4, 3 +; CHECK-P7-NEXT: fmul 4, 4, 0 +; CHECK-P7-NEXT: fmul 4, 4, 5 ; CHECK-P7-NEXT: fmul 2, 2, 4 +; CHECK-P7-NEXT: fmadd 3, 2, 4, 3 ; CHECK-P7-NEXT: fmul 0, 2, 0 -; CHECK-P7-NEXT: fmr 2, 0 +; CHECK-P7-NEXT: fmul 2, 0, 3 ; CHECK-P7-NEXT: blr ; ; CHECK-P8-LABEL: hoo4_fmf: