diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4276,6 +4276,13 @@ return SDValue(); } + /// Return a target-dependent result if the input operand is not suitable for + /// use with a square root estimate calculation. + virtual SDValue getSqrtResultForDenormInput(SDValue Operand, + SelectionDAG &DAG) const { + return DAG.getConstantFP(0.0, SDLoc(Operand), Operand.getValueType()); + } + //===--------------------------------------------------------------------===// // Legalization utility functions // diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -21949,8 +21949,6 @@ : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); if (!Reciprocal) { - // The estimate is now completely wrong if the input was exactly 0.0 or - // possibly a denormal. Force the answer to 0.0 for those cases. SDLoc DL(Op); EVT CCVT = getSetCCResultType(VT); SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); @@ -21974,10 +21972,13 @@ // Test = X == 0.0 Test = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); } - // Test ? 0.0 : Est - Est = DAG.getNode(Test.getValueType().isVector() ? ISD::VSELECT - : ISD::SELECT, - DL, VT, Test, FPZero, Est); + + // The estimate is now completely wrong if the input was exactly 0.0 or + // possibly a denormal. Force the answer to 0.0 or value provided by + // target for those cases. + Est = DAG.getNode( + Test.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, + Test, TLI.getSqrtResultForDenormInput(Op, DAG), Est); } } return Est; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -92,6 +92,9 @@ /// Test instruction for software square root. FTSQRT, + /// Square root instruction. + FSQRT, + /// VPERM - The PPC VPERM Instruction. /// VPERM, @@ -1292,6 +1295,8 @@ int &RefinementSteps) const override; SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG, const DenormalMode &Mode) const override; + SDValue getSqrtResultForDenormInput(SDValue Operand, + SelectionDAG &DAG) const override; unsigned combineRepeatedFPDivisors() const override; SDValue diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1418,6 +1418,8 @@ case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; case PPCISD::FTSQRT: return "PPCISD::FTSQRT"; + case PPCISD::FSQRT: + return "PPCISD::FSQRT"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; @@ -12695,6 +12697,17 @@ 0); } +SDValue +PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op, + SelectionDAG &DAG) const { + // TODO - add support for v2f64/v4f32 + EVT VT = Op.getValueType(); + if (VT != MVT::f64) + return TargetLowering::getSqrtResultForDenormInput(Op, DAG); + + return DAG.getNode(PPCISD::FSQRT, SDLoc(Op), VT, Op); +} + SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -127,6 +127,7 @@ def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; +def PPCfsqrt : SDNode<"PPCISD::FSQRT", SDTFPUnaryOp, []>; def PPCftsqrt : SDNode<"PPCISD::FTSQRT", SDT_PPCFtsqrt,[]>; def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; @@ -2704,6 +2705,8 @@ } } +def : Pat<(PPCfsqrt f64:$frA), (FSQRT $frA)>; + /// Note that FMR is defined as pseudo-ops on the PPC970 because they are /// often coalesced away and we don't want the dispatch group builder to think /// that they will fill slots (which could cause the load of a LSU reject to diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -2463,6 +2463,8 @@ def : Pat<(PPCfnmsub v4f32:$A, v4f32:$B, (fneg v4f32:$C)), (XVNMADDASP $C, $A, $B)>; +def : Pat<(PPCfsqrt f64:$frA), (XSSQRTDP $frA)>; + def : Pat<(v2f64 (bitconvert v4f32:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v2f64 (bitconvert v4i32:$A)), diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate.ll b/llvm/test/CodeGen/PowerPC/fma-mutate.ll --- a/llvm/test/CodeGen/PowerPC/fma-mutate.ll +++ b/llvm/test/CodeGen/PowerPC/fma-mutate.ll @@ -10,7 +10,6 @@ ; CHECK-LABEL: foo3_fmf: ; CHECK: # %bb.0: ; CHECK-NEXT: xstsqrtdp 0, 1 -; CHECK-NEXT: xxlxor 0, 0, 0 ; CHECK-NEXT: bc 12, 2, .LBB0_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: xsrsqrtedp 0, 1 @@ -25,9 +24,10 @@ ; CHECK-NEXT: xsmuldp 1, 1, 0 ; CHECK-NEXT: xsmaddadp 3, 1, 0 ; CHECK-NEXT: xsmuldp 0, 1, 4 -; CHECK-NEXT: xsmuldp 0, 0, 3 +; CHECK-NEXT: xsmuldp 1, 0, 3 +; CHECK-NEXT: blr ; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: fmr 1, 0 +; CHECK-NEXT: xssqrtdp 1, 1 ; CHECK-NEXT: blr %r = call reassoc afn ninf double @llvm.sqrt.f64(double %a) ret double %r diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -767,14 +767,12 @@ ; CHECK-P7-NEXT: fmul 1, 1, 0 ; CHECK-P7-NEXT: blr ; CHECK-P7-NEXT: .LBB20_2: -; CHECK-P7-NEXT: addis 3, 2, .LCPI20_2@toc@ha -; CHECK-P7-NEXT: lfs 1, .LCPI20_2@toc@l(3) +; CHECK-P7-NEXT: fsqrt 1, 1 ; CHECK-P7-NEXT: blr ; ; CHECK-P8-LABEL: foo3_fmf: ; CHECK-P8: # %bb.0: ; CHECK-P8-NEXT: xstsqrtdp 0, 1 -; CHECK-P8-NEXT: xxlxor 0, 0, 0 ; CHECK-P8-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P8-NEXT: # %bb.1: ; CHECK-P8-NEXT: xsrsqrtedp 0, 1 @@ -790,15 +788,15 @@ ; CHECK-P8-NEXT: xsmuldp 1, 1, 0 ; CHECK-P8-NEXT: xsmaddadp 3, 1, 0 ; CHECK-P8-NEXT: xsmuldp 0, 1, 4 -; CHECK-P8-NEXT: xsmuldp 0, 0, 3 +; CHECK-P8-NEXT: xsmuldp 1, 0, 3 +; CHECK-P8-NEXT: blr ; CHECK-P8-NEXT: .LBB20_2: -; CHECK-P8-NEXT: fmr 1, 0 +; CHECK-P8-NEXT: xssqrtdp 1, 1 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: foo3_fmf: ; CHECK-P9: # %bb.0: ; CHECK-P9-NEXT: xstsqrtdp 0, 1 -; CHECK-P9-NEXT: xxlxor 0, 0, 0 ; CHECK-P9-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P9-NEXT: # %bb.1: ; CHECK-P9-NEXT: xsrsqrtedp 0, 1 @@ -814,9 +812,10 @@ ; CHECK-P9-NEXT: xsmuldp 1, 1, 0 ; CHECK-P9-NEXT: xsmaddadp 3, 1, 0 ; CHECK-P9-NEXT: xsmuldp 0, 1, 2 -; CHECK-P9-NEXT: xsmuldp 0, 0, 3 +; CHECK-P9-NEXT: xsmuldp 1, 0, 3 +; CHECK-P9-NEXT: blr ; CHECK-P9-NEXT: .LBB20_2: -; CHECK-P9-NEXT: fmr 1, 0 +; CHECK-P9-NEXT: xssqrtdp 1, 1 ; CHECK-P9-NEXT: blr %r = call reassoc ninf afn double @llvm.sqrt.f64(double %a) ret double %r @@ -1028,45 +1027,41 @@ define <2 x double> @hoo4_fmf(<2 x double> %a) #1 { ; CHECK-P7-LABEL: hoo4_fmf: ; CHECK-P7: # %bb.0: -; CHECK-P7-NEXT: addis 3, 2, .LCPI26_2@toc@ha ; CHECK-P7-NEXT: ftsqrt 0, 1 -; CHECK-P7-NEXT: fmr 3, 1 -; CHECK-P7-NEXT: addis 4, 2, .LCPI26_0@toc@ha -; CHECK-P7-NEXT: lfs 0, .LCPI26_2@toc@l(3) -; CHECK-P7-NEXT: addis 3, 2, .LCPI26_1@toc@ha -; CHECK-P7-NEXT: lfs 5, .LCPI26_0@toc@l(4) -; CHECK-P7-NEXT: lfs 4, .LCPI26_1@toc@l(3) -; CHECK-P7-NEXT: fmr 1, 0 -; CHECK-P7-NEXT: bc 4, 2, .LBB26_3 +; CHECK-P7-NEXT: addis 3, 2, .LCPI26_0@toc@ha +; CHECK-P7-NEXT: addis 4, 2, .LCPI26_1@toc@ha +; CHECK-P7-NEXT: lfs 3, .LCPI26_0@toc@l(3) +; CHECK-P7-NEXT: lfs 0, .LCPI26_1@toc@l(4) +; CHECK-P7-NEXT: bc 12, 2, .LBB26_3 ; CHECK-P7-NEXT: # %bb.1: +; CHECK-P7-NEXT: frsqrte 4, 1 +; CHECK-P7-NEXT: fmul 5, 1, 4 +; CHECK-P7-NEXT: fmadd 5, 5, 4, 3 +; CHECK-P7-NEXT: fmul 4, 4, 0 +; CHECK-P7-NEXT: fmul 4, 4, 5 +; CHECK-P7-NEXT: fmul 1, 1, 4 +; CHECK-P7-NEXT: fmadd 4, 1, 4, 3 +; CHECK-P7-NEXT: fmul 1, 1, 0 +; CHECK-P7-NEXT: fmul 1, 1, 4 ; CHECK-P7-NEXT: ftsqrt 0, 2 ; CHECK-P7-NEXT: bc 4, 2, .LBB26_4 ; CHECK-P7-NEXT: .LBB26_2: -; CHECK-P7-NEXT: fmr 2, 0 +; CHECK-P7-NEXT: fsqrt 2, 2 ; CHECK-P7-NEXT: blr ; CHECK-P7-NEXT: .LBB26_3: -; CHECK-P7-NEXT: frsqrte 1, 3 -; CHECK-P7-NEXT: fmul 6, 3, 1 -; CHECK-P7-NEXT: fmadd 6, 6, 1, 5 -; CHECK-P7-NEXT: fmul 1, 1, 4 -; CHECK-P7-NEXT: fmul 1, 1, 6 -; CHECK-P7-NEXT: fmul 3, 3, 1 -; CHECK-P7-NEXT: fmadd 1, 3, 1, 5 -; CHECK-P7-NEXT: fmul 3, 3, 4 -; CHECK-P7-NEXT: fmul 1, 3, 1 +; CHECK-P7-NEXT: fsqrt 1, 1 ; CHECK-P7-NEXT: ftsqrt 0, 2 ; CHECK-P7-NEXT: bc 12, 2, .LBB26_2 ; CHECK-P7-NEXT: .LBB26_4: -; CHECK-P7-NEXT: frsqrte 0, 2 -; CHECK-P7-NEXT: fmul 3, 2, 0 -; CHECK-P7-NEXT: fmadd 3, 3, 0, 5 -; CHECK-P7-NEXT: fmul 0, 0, 4 -; CHECK-P7-NEXT: fmul 0, 0, 3 -; CHECK-P7-NEXT: fmul 2, 2, 0 -; CHECK-P7-NEXT: fmadd 0, 2, 0, 5 +; CHECK-P7-NEXT: frsqrte 4, 2 +; CHECK-P7-NEXT: fmul 5, 2, 4 +; CHECK-P7-NEXT: fmadd 5, 5, 4, 3 +; CHECK-P7-NEXT: fmul 4, 4, 0 +; CHECK-P7-NEXT: fmul 4, 4, 5 ; CHECK-P7-NEXT: fmul 2, 2, 4 +; CHECK-P7-NEXT: fmadd 3, 2, 4, 3 ; CHECK-P7-NEXT: fmul 0, 2, 0 -; CHECK-P7-NEXT: fmr 2, 0 +; CHECK-P7-NEXT: fmul 2, 0, 3 ; CHECK-P7-NEXT: blr ; ; CHECK-P8-LABEL: hoo4_fmf: