diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4277,6 +4277,15 @@
     return SDValue();
   }
 
+  /// Return a target-dependent comparison result that is true when the input
+  /// operand is NOT suitable for a square root estimate calculation (e.g. it
+  /// is NAN, INF, zero or denormal), so the estimate must be replaced. The
+  /// result should be used as the condition operand for a select or branch.
+  virtual SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG,
+                                   const DenormalMode &Mode) const {
+    return SDValue();
+  }
+
   //===--------------------------------------------------------------------===//
   // Legalization utility functions
   //
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22056,26 +22056,31 @@
         // possibly a denormal. Force the answer to 0.0 for those cases.
         SDLoc DL(Op);
         EVT CCVT = getSetCCResultType(VT);
-        ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
         DenormalMode DenormMode = DAG.getDenormalMode(VT);
-        if (DenormMode.Input == DenormalMode::IEEE) {
-          // This is specifically a check for the handling of denormal inputs,
-          // not the result.
-
-          // fabs(X) < SmallestNormal ? 0.0 : Est
-          const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
-          APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
-          SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
-          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
-          SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
-          SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
-          Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
-        } else {
-          // X == 0.0 ? 0.0 : Est
-          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
-          SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
-          Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
+        // Try the target-specific test first.
+        SDValue Test = TLI.getSqrtInputTest(Op, DAG, DenormMode);
+        if (!Test) {
+          // If the target provides no test, fall back to a generic check for
+          // inputs (denormal or zero) that would give a wrong estimate.
+          if (DenormMode.Input == DenormalMode::IEEE) {
+            // This is specifically a check for the handling of denormal inputs,
+            // not the result.
+
+            // Test = fabs(X) < SmallestNormal
+            const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
+            APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
+            SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
+            SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
+            Test = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
+          } else
+            // Test = X == 0.0
+            Test = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
         }
+        // Test ? 0.0 : Est
+        Est = DAG.getNode(Test.getValueType().isVector() ? ISD::VSELECT
+                                                         : ISD::SELECT,
+                          DL, VT, Test, FPZero, Est);
       }
     }
     return Est;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -89,6 +89,9 @@
     FRE,
     FRSQRTE,
 
+    /// Test instruction for software square root.
+    FTSQRT,
+
     /// VPERM - The PPC VPERM Instruction.
/// VPERM, @@ -1283,6 +1286,8 @@ bool Reciprocal) const override; SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override; + SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG, + const DenormalMode &Mode) const override; unsigned combineRepeatedFPDivisors() const override; SDValue diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1447,6 +1447,8 @@ return "PPCISD::FP_TO_SINT_IN_VSR"; case PPCISD::FRE: return "PPCISD::FRE"; case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; + case PPCISD::FTSQRT: + return "PPCISD::FTSQRT"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; @@ -12758,6 +12760,33 @@ return RefinementSteps; } +SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, + const DenormalMode &Mode) const { + // TODO - add support for v2f64/v4f32 + EVT VT = Op.getValueType(); + if (VT != MVT::f64) + return SDValue(); + + SDLoc DL(Op); + // The output register of FTSQRT is CR field. + SDValue FTSQRT = DAG.getNode(PPCISD::FTSQRT, DL, MVT::i32, Op); + // ftsqrt BF,FRB + // Let e_b be the unbiased exponent of the double-precision + // floating-point operand in register FRB. + // fe_flag is set to 1 if either of the following conditions occurs. + // - The double-precision floating-point operand in register FRB is a zero, + // a NaN, or an infinity, or a negative value. + // - e_b is less than or equal to -970. + // Otherwise fe_flag is set to 0. + // Both VSX and non-VSX versions would set EQ bit in the CR if the number is + // not eligible for iteration. 
(zero/negative/infinity/nan or unbiased + // exponent is less than -970) + SDValue SRIdxVal = DAG.getTargetConstant(PPC::sub_eq, DL, MVT::i32); + return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i1, + FTSQRT, SRIdxVal), + 0); +} + SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -637,9 +637,10 @@ } class XForm_17a opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin> + InstrItinClass itin, list pattern> : XForm_17 { let FRA = 0; + let Pattern = pattern; } class XForm_18 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -74,6 +74,9 @@ SDTCisVT<0, i32>, SDTCisVT<2, OtherVT> ]>; +def SDT_PPCFtsqrt : SDTypeProfile<1, 1, [ + SDTCisVT<0, i32>]>; + def SDT_PPClbrx : SDTypeProfile<1, 2, [ SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT> ]>; @@ -124,6 +127,7 @@ def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; +def PPCftsqrt : SDNode<"PPCISD::FTSQRT", SDT_PPCFtsqrt,[]>; def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>; @@ -2643,7 +2647,8 @@ def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), "ftdiv $crD, $fA, $fB", IIC_FPCompare>; def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB), - "ftsqrt $crD, $fB", IIC_FPCompare>; + "ftsqrt $crD, $fB", IIC_FPCompare, + [(set i32:$crD, (PPCftsqrt f64:$fB))]>; let mayRaiseFPException = 1, hasSideEffects = 0 in { let Interpretation64Bit = 1, 
isCodeGenOnly = 1 in diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -629,7 +629,8 @@ "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>; def XSTSQRTDP : XX2Form_1<60, 106, (outs crrc:$crD), (ins vsfrc:$XB), - "xstsqrtdp $crD, $XB", IIC_FPCompare, []>; + "xstsqrtdp $crD, $XB", IIC_FPCompare, + [(set i32:$crD, (PPCftsqrt f64:$XB))]>; def XVTDIVDP : XX3Form_1<60, 125, (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>; diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate.ll b/llvm/test/CodeGen/PowerPC/fma-mutate.ll --- a/llvm/test/CodeGen/PowerPC/fma-mutate.ll +++ b/llvm/test/CodeGen/PowerPC/fma-mutate.ll @@ -9,12 +9,9 @@ define double @foo3_fmf(double %a) nounwind { ; CHECK-LABEL: foo3_fmf: ; CHECK: # %bb.0: -; CHECK-NEXT: xsabsdp 0, 1 -; CHECK-NEXT: addis 3, 2, .LCPI0_2@toc@ha -; CHECK-NEXT: lfd 2, .LCPI0_2@toc@l(3) -; CHECK-NEXT: xscmpudp 0, 0, 2 +; CHECK-NEXT: xstsqrtdp 0, 1 ; CHECK-NEXT: xxlxor 0, 0, 0 -; CHECK-NEXT: blt 0, .LBB0_2 +; CHECK-NEXT: bc 12, 2, .LBB0_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: xsrsqrtedp 0, 1 ; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -749,11 +749,8 @@ define double @foo3_fmf(double %a) nounwind { ; CHECK-P7-LABEL: foo3_fmf: ; CHECK-P7: # %bb.0: -; CHECK-P7-NEXT: fabs 0, 1 -; CHECK-P7-NEXT: addis 3, 2, .LCPI20_2@toc@ha -; CHECK-P7-NEXT: lfd 2, .LCPI20_2@toc@l(3) -; CHECK-P7-NEXT: fcmpu 0, 0, 2 -; CHECK-P7-NEXT: blt 0, .LBB20_2 +; CHECK-P7-NEXT: ftsqrt 0, 1 +; CHECK-P7-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P7-NEXT: # %bb.1: ; CHECK-P7-NEXT: frsqrte 0, 1 ; CHECK-P7-NEXT: addis 3, 2, .LCPI20_0@toc@ha @@ -770,18 +767,15 @@ ; CHECK-P7-NEXT: fmul 1, 1, 0 ; CHECK-P7-NEXT: blr ; CHECK-P7-NEXT: .LBB20_2: -; 
CHECK-P7-NEXT: addis 3, 2, .LCPI20_3@toc@ha -; CHECK-P7-NEXT: lfs 1, .LCPI20_3@toc@l(3) +; CHECK-P7-NEXT: addis 3, 2, .LCPI20_2@toc@ha +; CHECK-P7-NEXT: lfs 1, .LCPI20_2@toc@l(3) ; CHECK-P7-NEXT: blr ; ; CHECK-P8-LABEL: foo3_fmf: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: xsabsdp 0, 1 -; CHECK-P8-NEXT: addis 3, 2, .LCPI20_2@toc@ha -; CHECK-P8-NEXT: lfd 2, .LCPI20_2@toc@l(3) -; CHECK-P8-NEXT: xscmpudp 0, 0, 2 +; CHECK-P8-NEXT: xstsqrtdp 0, 1 ; CHECK-P8-NEXT: xxlxor 0, 0, 0 -; CHECK-P8-NEXT: blt 0, .LBB20_2 +; CHECK-P8-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P8-NEXT: # %bb.1: ; CHECK-P8-NEXT: xsrsqrtedp 0, 1 ; CHECK-P8-NEXT: addis 3, 2, .LCPI20_0@toc@ha @@ -803,12 +797,9 @@ ; ; CHECK-P9-LABEL: foo3_fmf: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis 3, 2, .LCPI20_2@toc@ha -; CHECK-P9-NEXT: xsabsdp 0, 1 -; CHECK-P9-NEXT: lfd 2, .LCPI20_2@toc@l(3) -; CHECK-P9-NEXT: xscmpudp 0, 0, 2 +; CHECK-P9-NEXT: xstsqrtdp 0, 1 ; CHECK-P9-NEXT: xxlxor 0, 0, 0 -; CHECK-P9-NEXT: blt 0, .LBB20_2 +; CHECK-P9-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P9-NEXT: # %bb.1: ; CHECK-P9-NEXT: xsrsqrtedp 0, 1 ; CHECK-P9-NEXT: addis 3, 2, .LCPI20_0@toc@ha @@ -1038,18 +1029,18 @@ ; CHECK-P7-LABEL: hoo4_fmf: ; CHECK-P7: # %bb.0: ; CHECK-P7-NEXT: addis 3, 2, .LCPI26_2@toc@ha +; CHECK-P7-NEXT: ftsqrt 0, 1 ; CHECK-P7-NEXT: fmr 3, 1 -; CHECK-P7-NEXT: addis 4, 2, .LCPI26_1@toc@ha +; CHECK-P7-NEXT: addis 4, 2, .LCPI26_0@toc@ha ; CHECK-P7-NEXT: lfs 0, .LCPI26_2@toc@l(3) -; CHECK-P7-NEXT: addis 3, 2, .LCPI26_0@toc@ha -; CHECK-P7-NEXT: lfs 4, .LCPI26_1@toc@l(4) -; CHECK-P7-NEXT: lfs 5, .LCPI26_0@toc@l(3) -; CHECK-P7-NEXT: fcmpu 0, 1, 0 +; CHECK-P7-NEXT: addis 3, 2, .LCPI26_1@toc@ha +; CHECK-P7-NEXT: lfs 5, .LCPI26_0@toc@l(4) +; CHECK-P7-NEXT: lfs 4, .LCPI26_1@toc@l(3) ; CHECK-P7-NEXT: fmr 1, 0 -; CHECK-P7-NEXT: bne 0, .LBB26_3 +; CHECK-P7-NEXT: bc 4, 2, .LBB26_3 ; CHECK-P7-NEXT: # %bb.1: -; CHECK-P7-NEXT: fcmpu 0, 2, 0 -; CHECK-P7-NEXT: bne 0, .LBB26_4 +; CHECK-P7-NEXT: ftsqrt 0, 2 +; CHECK-P7-NEXT: bc 4, 2, .LBB26_4 ; 
CHECK-P7-NEXT: .LBB26_2: ; CHECK-P7-NEXT: fmr 2, 0 ; CHECK-P7-NEXT: blr @@ -1063,8 +1054,8 @@ ; CHECK-P7-NEXT: fmadd 1, 3, 1, 5 ; CHECK-P7-NEXT: fmul 3, 3, 4 ; CHECK-P7-NEXT: fmul 1, 3, 1 -; CHECK-P7-NEXT: fcmpu 0, 2, 0 -; CHECK-P7-NEXT: beq 0, .LBB26_2 +; CHECK-P7-NEXT: ftsqrt 0, 2 +; CHECK-P7-NEXT: bc 12, 2, .LBB26_2 ; CHECK-P7-NEXT: .LBB26_4: ; CHECK-P7-NEXT: frsqrte 0, 2 ; CHECK-P7-NEXT: fmul 3, 2, 0