Index: llvm/include/llvm/Target/TargetLowering.h =================================================================== --- llvm/include/llvm/Target/TargetLowering.h +++ llvm/include/llvm/Target/TargetLowering.h @@ -25,8 +25,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -269,36 +270,78 @@ return false; } - /// Reciprocal estimate status values used by the functions below. - enum ReciprocalEstimate : int { - Unspecified = -1, - Disabled = 0, - Enabled = 1 + /// Default number of steps of Newton's method to perform when refining an + /// approximate (r)sqrt. Overridden by the reciprocal-estimates attr. + /// + /// There are actually two pieces of information we need to provide here: + /// + /// - Whether to emit an approximate (r)sqrt at all if reciprocal-estimates + /// says nothing about the op in question (e.g. it's empty or equal to + /// "default"). + /// + /// - The default number of steps of Newton's method to run when the + /// reciprocal-estimates attribute doesn't give an explicit number of steps + /// (e.g. it's "sqrt", rather than "sqrt:2"). + /// + /// If you want to enable approximate (r)sqrts on your target, you'll need to + /// override both getSqrtRefinementDefaults and getRsqrtEstimate. + /// + /// The recip version of this function is analogous, for the operation 1/x. 
+ struct RefinementDefaults { + bool EnabledByDefault; + unsigned DefaultSteps; }; + virtual RefinementDefaults getSqrtRefinementDefaults(EVT) const { + return {/*EnabledByDefault=*/false, /*DefaultSteps=*/0}; + } + virtual RefinementDefaults getRecipRefinementDefaults(EVT) const { + return {/*EnabledByDefault=*/false, /*DefaultSteps=*/0}; + } - /// Return a ReciprocalEstimate enum value for a square root of the given type - /// based on the function's attributes. If the operation is not overridden by - /// the function's attributes, "Unspecified" is returned and target defaults - /// are expected to be used for instruction selection. - int getRecipEstimateSqrtEnabled(EVT VT, MachineFunction &MF) const; + /// If true, we use a Newton-Raphson algorithm with a single constant to + /// refine sqrt estimates. Otherwise, we use an algorithm with two + /// constants. + virtual bool useOneConstNRForSqrtEstimate(EVT) const { return false; } - /// Return a ReciprocalEstimate enum value for a division of the given type - /// based on the function's attributes. If the operation is not overridden by - /// the function's attributes, "Unspecified" is returned and target defaults - /// are expected to be used for instruction selection. - int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF) const; + /// Get the number of refinement steps we should perform on an approximate + /// (r)sqrt operation with the given type and in the given function. + /// + /// A target has a default number of (r)sqrt refinement steps, returned by + /// getSqrtRefinementDefaults(), and a function can override this with the + /// reciprocal-estimates attr. + /// + /// If this returns None, we don't use an approximate (r)sqrt here. + /// + /// The recip version of this function is analogous, for the operation 1/x. 
+ virtual Optional + getSqrtEstimateRefinementSteps(EVT VT, const MachineFunction &MF) const; - /// Return the refinement step count for a square root of the given type based - /// on the function's attributes. If the operation is not overridden by - /// the function's attributes, "Unspecified" is returned and target defaults - /// are expected to be used for instruction selection. - int getSqrtRefinementSteps(EVT VT, MachineFunction &MF) const; + virtual Optional + getRecipEstimateRefinementSteps(EVT VT, const MachineFunction &MF) const; - /// Return the refinement step count for a division of the given type based - /// on the function's attributes. If the operation is not overridden by - /// the function's attributes, "Unspecified" is returned and target defaults - /// are expected to be used for instruction selection. - int getDivRefinementSteps(EVT VT, MachineFunction &MF) const; + /// Get a first approximation of the reciprocal square root of Operand. This + /// will be refined by getSqrtEstimateRefinementSteps() steps of Newton's + /// method. + virtual SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG) const { + return SDValue(); + } + + /// Get a first approximation of the square root of Operand. + /// + /// This is never fed into Newton's method -- for that, we always use the + /// rsqrt estimate. It's only used when getSqrtEstimateRefinementSteps() is + /// 0. If this returns an empty SDValue, callers should try to compute an + /// approximate sqrt using getRsqrtEstimate(). + virtual SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG) const { + return SDValue(); + } + + /// Get a first approximation of the reciprocal of Operand. This will be + /// refined by getRecipEstimateRefinementSteps() iterations of Newton's + /// method. + virtual SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG) const { + return SDValue(); + } /// Returns true if target has indicated at least one type should be bypassed. 
bool isSlowDivBypassed() const { return !BypassSlowDivWidths.empty(); } @@ -3021,45 +3064,6 @@ return 0; } - /// Hooks for building estimates in place of slower divisions and square - /// roots. - - /// Return either a square root or its reciprocal estimate value for the input - /// operand. - /// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or - /// 'Enabled' as set by a potential default override attribute. - /// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson - /// refinement iterations required to generate a sufficient (though not - /// necessarily IEEE-754 compliant) estimate is returned in that parameter. - /// The boolean UseOneConstNR output is used to select a Newton-Raphson - /// algorithm implementation that uses either one or two constants. - /// The boolean Reciprocal is used to select whether the estimate is for the - /// square root of the input operand or the reciprocal of its square root. - /// A target may choose to implement its own refinement within this function. - /// If that's true, then return '0' as the number of RefinementSteps to avoid - /// any further refinement of the estimate. - /// An empty SDValue return means no estimate sequence can be created. - virtual SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, - int Enabled, int &RefinementSteps, - bool &UseOneConstNR, bool Reciprocal) const { - return SDValue(); - } - - /// Return a reciprocal estimate value for the input operand. - /// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or - /// 'Enabled' as set by a potential default override attribute. - /// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson - /// refinement iterations required to generate a sufficient (though not - /// necessarily IEEE-754 compliant) estimate is returned in that parameter. - /// A target may choose to implement its own refinement within this function. 
- /// If that's true, then return '0' as the number of RefinementSteps to avoid - /// any further refinement of the estimate. - /// An empty SDValue return means no estimate sequence can be created. - virtual SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, - int Enabled, int &RefinementSteps) const { - return SDValue(); - } - //===--------------------------------------------------------------------===// // Legalization utility functions // Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15339,14 +15339,11 @@ // If estimates are explicitly disabled for this function, we're done. MachineFunction &MF = DAG.getMachineFunction(); - int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF); - if (Enabled == TLI.ReciprocalEstimate::Disabled) + Optional Iterations = TLI.getRecipEstimateRefinementSteps(VT, MF); + if (!Iterations) return SDValue(); - // Estimates may be explicitly enabled for this type with a custom number of - // refinement steps. - int Iterations = TLI.getDivRefinementSteps(VT, MF); - if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) { + if (SDValue Est = TLI.getRecipEstimate(Op, DAG)) { AddToWorklist(Est.getNode()); if (Iterations) { @@ -15355,7 +15352,7 @@ SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); // Newton iterations: Est = Est + Est (1 - Arg * Est) - for (int i = 0; i < Iterations; ++i) { + for (unsigned i = 0; i < *Iterations; ++i) { SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags); AddToWorklist(NewEst.getNode()); @@ -15484,40 +15481,42 @@ // If estimates are explicitly disabled for this function, we're done. 
MachineFunction &MF = DAG.getMachineFunction(); - int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF); - if (Enabled == TLI.ReciprocalEstimate::Disabled) + Optional Iterations = TLI.getSqrtEstimateRefinementSteps(VT, MF); + if (!Iterations) return SDValue(); - // Estimates may be explicitly enabled for this type with a custom number of - // refinement steps. - int Iterations = TLI.getSqrtRefinementSteps(VT, MF); + // Special case: No iterations of Newton's method, and computing plain sqrt. + // In this case, return TLI.getSqrtEstimate if possible. If not, we'll + // compute the sqrt from the rsqrt estimate. + if (*Iterations == 0 && !Reciprocal) { + if (SDValue Est = TLI.getSqrtEstimate(Op, DAG)) { + AddToWorklist(Est.getNode()); + return Est; + } + } - bool UseOneConstNR = false; - if (SDValue Est = - TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR, - Reciprocal)) { + if (SDValue Est = TLI.getRsqrtEstimate(Op, DAG)) { AddToWorklist(Est.getNode()); - if (Iterations) { - Est = UseOneConstNR - ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal) - : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); + if (*Iterations > 0) + Est = TLI.useOneConstNRForSqrtEstimate(VT) + ? buildSqrtNROneConst(Op, Est, *Iterations, Flags, Reciprocal) + : buildSqrtNRTwoConst(Op, Est, *Iterations, Flags, Reciprocal); - if (!Reciprocal) { - // Unfortunately, Est is now NaN if the input was exactly 0.0. - // Select out this case and force the answer to 0.0. - EVT VT = Op.getValueType(); - SDLoc DL(Op); + if (!Reciprocal) { + // Unfortunately, Est is now NaN if the input was exactly 0.0. + // Select out this case and force the answer to 0.0. 
+ EVT VT = Op.getValueType(); + SDLoc DL(Op); - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - EVT CCVT = getSetCCResultType(VT); - SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); - AddToWorklist(ZeroCmp.getNode()); + SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); + EVT CCVT = getSetCCResultType(VT); + SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); + AddToWorklist(ZeroCmp.getNode()); - Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, - ZeroCmp, FPZero, Est); - AddToWorklist(Est.getNode()); - } + Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, + ZeroCmp, FPZero, Est); + AddToWorklist(Est.getNode()); } return Est; } Index: llvm/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1914,182 +1914,92 @@ // Reciprocal Estimates //===----------------------------------------------------------------------===// -/// Get the reciprocal estimate attribute string for a function that will -/// override the target defaults. -static StringRef getRecipEstimateForFunc(MachineFunction &MF) { - const Function *F = MF.getFunction(); - return F->getFnAttribute("reciprocal-estimates").getValueAsString(); -} +/// Gets the number of refinement steps we should perform on an approx (r)sqrt +/// or div of the given type. +/// +/// Returns None if we shouldn't perform an approx (r)sqrt/div at all. +static Optional +getOpRefinementStepsFromAttr(bool IsSqrt, EVT VT, const MachineFunction &MF, + TargetLowering::RefinementDefaults Defaults) { + StringRef Attr = MF.getFunction() + ->getFnAttribute("reciprocal-estimates") + .getValueAsString(); -/// Construct a string for the given reciprocal operation of the given type. 
-/// This string should match the corresponding option to the front-end's -/// "-mrecip" flag assuming those strings have been passed through in an -/// attribute string. For example, "vec-divf" for a division of a vXf32. -static std::string getReciprocalOpName(bool IsSqrt, EVT VT) { - std::string Name = VT.isVector() ? "vec-" : ""; + // Attr is a comma-separated list of tokens. Valid tokens are of the form + // "fn", "fn:N", "!fn", or "!fn:N", where N is an integer and "fn" is one of: + // + // - none, all, default + // - div, divf, divd + // - vec-div, vec-divf, vec-divd, + // - sqrt, sqrtf, sqrtd + // - vec-sqrt, vec-sqrtf, vec-sqrtd + // - Name += IsSqrt ? "sqrt" : "div"; + SmallString<16> UntypedOpName; + UntypedOpName += VT.isVector() ? "vec-" : ""; + UntypedOpName += IsSqrt ? "sqrt" : "div"; - // TODO: Handle "half" or other float types? - if (VT.getScalarType() == MVT::f64) { - Name += "d"; - } else { - assert(VT.getScalarType() == MVT::f32 && - "Unexpected FP type for reciprocal estimate"); - Name += "f"; - } + SmallString<16> TypedOpName = UntypedOpName; + if (VT.getScalarType() == MVT::f64) + TypedOpName += "d"; + else if (VT.getScalarType() == MVT::f32) + TypedOpName += "f"; + else + llvm_unreachable("Unexpected type; not f64 or f32."); - return Name; -} + Optional Steps; + if (Defaults.EnabledByDefault) + Steps = Defaults.DefaultSteps; -/// Return the character position and value (a single numeric character) of a -/// customized refinement operation in the input string if it exists. Return -/// false if there is no customized refinement step count. 
-static bool parseRefinementStep(StringRef In, size_t &Position, - uint8_t &Value) { - const char RefStepToken = ':'; - Position = In.find(RefStepToken); - if (Position == StringRef::npos) - return false; + SmallVector SplitAttr; + SplitString(Attr, SplitAttr, ","); + for (StringRef Tok : SplitAttr) { + StringRef OpName, OpStepsStr; + std::tie(OpName, OpStepsStr) = Tok.split(':'); - StringRef RefStepString = In.substr(Position + 1); - // Allow exactly one numeric character for the additional refinement - // step parameter. - if (RefStepString.size() == 1) { - char RefStepChar = RefStepString[0]; - if (RefStepChar >= '0' && RefStepChar <= '9') { - Value = RefStepChar - '0'; - return true; - } - } - report_fatal_error("Invalid refinement step for -recip."); -} - -/// For the input attribute string, return one of the ReciprocalEstimate enum -/// status values (enabled, disabled, or not specified) for this operation on -/// the specified data type. -static int getOpEnabled(bool IsSqrt, EVT VT, StringRef Override) { - if (Override.empty()) - return TargetLoweringBase::ReciprocalEstimate::Unspecified; - - SmallVector OverrideVector; - SplitString(Override, OverrideVector, ","); - unsigned NumArgs = OverrideVector.size(); - - // Check if "all", "none", or "default" was specified. - if (NumArgs == 1) { - // Look for an optional setting of the number of refinement steps needed - // for this type of reciprocal operation. - size_t RefPos; - uint8_t RefSteps; - if (parseRefinementStep(Override, RefPos, RefSteps)) { - // Split the string for further processing. - Override = Override.substr(0, RefPos); + if (OpName == "none") { + Steps = None; + continue; } - // All reciprocal types are enabled. - if (Override == "all") - return TargetLoweringBase::ReciprocalEstimate::Enabled; + if (OpName.startswith("!")) { + OpName = OpName.drop_front(1); + if (OpName == UntypedOpName || OpName == TypedOpName) + Steps = None; + continue; + } - // All reciprocal types are disabled. 
- if (Override == "none") - return TargetLoweringBase::ReciprocalEstimate::Disabled; - - // Target defaults for enablement are used. - if (Override == "default") - return TargetLoweringBase::ReciprocalEstimate::Unspecified; - } - - // The attribute string may omit the size suffix ('f'/'d'). - std::string VTName = getReciprocalOpName(IsSqrt, VT); - std::string VTNameNoSize = VTName; - VTNameNoSize.pop_back(); - static const char DisabledPrefix = '!'; - - for (StringRef RecipType : OverrideVector) { - size_t RefPos; - uint8_t RefSteps; - if (parseRefinementStep(RecipType, RefPos, RefSteps)) - RecipType = RecipType.substr(0, RefPos); - - // Ignore the disablement token for string matching. - bool IsDisabled = RecipType[0] == DisabledPrefix; - if (IsDisabled) - RecipType = RecipType.substr(1); - - if (RecipType.equals(VTName) || RecipType.equals(VTNameNoSize)) - return IsDisabled ? TargetLoweringBase::ReciprocalEstimate::Disabled - : TargetLoweringBase::ReciprocalEstimate::Enabled; - } - - return TargetLoweringBase::ReciprocalEstimate::Unspecified; -} - -/// For the input attribute string, return the customized refinement step count -/// for this operation on the specified data type. If the step count does not -/// exist, return the ReciprocalEstimate enum value for unspecified. -static int getOpRefinementSteps(bool IsSqrt, EVT VT, StringRef Override) { - if (Override.empty()) - return TargetLoweringBase::ReciprocalEstimate::Unspecified; - - SmallVector OverrideVector; - SplitString(Override, OverrideVector, ","); - unsigned NumArgs = OverrideVector.size(); - - // Check if "all", "default", or "none" was specified. - if (NumArgs == 1) { - // Look for an optional setting of the number of refinement steps needed - // for this type of reciprocal operation. - size_t RefPos; - uint8_t RefSteps; - if (!parseRefinementStep(Override, RefPos, RefSteps)) - return TargetLoweringBase::ReciprocalEstimate::Unspecified; - - // Split the string for further processing. 
- Override = Override.substr(0, RefPos); - assert(Override != "none" && - "Disabled reciprocals, but specifed refinement steps?"); - - // If this is a general override, return the specified number of steps. - if (Override == "all" || Override == "default") - return RefSteps; - } - - // The attribute string may omit the size suffix ('f'/'d'). - std::string VTName = getReciprocalOpName(IsSqrt, VT); - std::string VTNameNoSize = VTName; - VTNameNoSize.pop_back(); - - for (StringRef RecipType : OverrideVector) { - size_t RefPos; - uint8_t RefSteps; - if (!parseRefinementStep(RecipType, RefPos, RefSteps)) + if (OpName != "all" && OpName != "default" && OpName != UntypedOpName && + OpName != TypedOpName) continue; - RecipType = RecipType.substr(0, RefPos); - if (RecipType.equals(VTName) || RecipType.equals(VTNameNoSize)) - return RefSteps; + Optional OpSteps; + if (!OpStepsStr.empty()) { + unsigned ParsedSteps; + if (OpStepsStr.getAsInteger(/*Radix=*/10, ParsedSteps)) + continue; + OpSteps = ParsedSteps; + } + + // Apply target's default number of steps if the token didn't contain a + // number of steps. 
+ if (!OpSteps) + OpSteps = Defaults.DefaultSteps; + + Steps = OpSteps; } - return TargetLoweringBase::ReciprocalEstimate::Unspecified; + return Steps; } -int TargetLoweringBase::getRecipEstimateSqrtEnabled(EVT VT, - MachineFunction &MF) const { - return getOpEnabled(true, VT, getRecipEstimateForFunc(MF)); +Optional TargetLoweringBase::getSqrtEstimateRefinementSteps( + EVT VT, const MachineFunction &MF) const { + return getOpRefinementStepsFromAttr(/*IsSqrt=*/true, VT, MF, + getSqrtRefinementDefaults(VT)); } -int TargetLoweringBase::getRecipEstimateDivEnabled(EVT VT, - MachineFunction &MF) const { - return getOpEnabled(false, VT, getRecipEstimateForFunc(MF)); -} - -int TargetLoweringBase::getSqrtRefinementSteps(EVT VT, - MachineFunction &MF) const { - return getOpRefinementSteps(true, VT, getRecipEstimateForFunc(MF)); -} - -int TargetLoweringBase::getDivRefinementSteps(EVT VT, - MachineFunction &MF) const { - return getOpRefinementSteps(false, VT, getRecipEstimateForFunc(MF)); +Optional TargetLoweringBase::getRecipEstimateRefinementSteps( + EVT VT, const MachineFunction &MF) const { + return getOpRefinementStepsFromAttr(/*IsSqrt=*/false, VT, MF, + getRecipRefinementDefaults(VT)); } Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -435,6 +435,11 @@ return true; } + Optional + getSqrtEstimateRefinementSteps(EVT, const MachineFunction &) const override; + Optional + getRecipEstimateRefinementSteps(EVT, const MachineFunction &) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -539,11 +544,13 @@ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector *Created) const override; - SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, - int &ExtraSteps, bool &UseOneConst, - bool Reciprocal) const override; - 
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, - int &ExtraSteps) const override; + + RefinementDefaults getSqrtRefinementDefaults(EVT) const override; + RefinementDefaults getRecipRefinementDefaults(EVT) const override; + + SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG) const override; + SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG) const override; + SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG) const override; unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4675,96 +4675,174 @@ // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// -static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, - SDValue Operand, SelectionDAG &DAG, - int &ExtraSteps) { - EVT VT = Operand.getValueType(); - if (ST->hasNEON() && - (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || - VT == MVT::f32 || VT == MVT::v1f32 || - VT == MVT::v2f32 || VT == MVT::v4f32)) { - if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) - // For the reciprocal estimates, convergence is quadratic, so the number - // of digits is doubled after each iteration. In ARMv8, the accuracy of - // the initial estimate is 2^-8. Thus the number of extra steps to refine - // the result for float (23 mantissa bits) is 2 and for double (52 - // mantissa bits) is 3. - ExtraSteps = VT == MVT::f64 ? 3 : 2; +// We do our own custom (r)sqrt and recip refinement instead of relying on the +// generic logic in DAGCombine, so we have to abuse the TLI interface a bit. 
+// +// The return value of getSqrtEstimateRefinementSteps is essentially a bool -- +// it's always either 0 or None. This controls whether approx (r)sqrts are +// enabled at all. If they're enabled (getSqrtEstimateRefinementSteps returns +// 0), we end up in getRsqrtEstimate, which then calls +// TargetLowering::getSqrtEstimateRefinementSteps to find out how many +// refinement steps it *actually* should do. +// +// If we didn't override getSqrtEstimateRefinementSteps to return 0, then when +// presented with e.g. "reciprocal-estimates"="sqrt:3", we'd do three custom +// refinements of the sqrt here, followed by *another* three refinements in +// DAGCombine. +// +// TargetLowering::getSqrtEstimateRefinementSteps may call back into +// getSqrtRefinementDefaults, if e.g. you specify +// "reciprocal-estimates"="sqrt" (meaning, "turn on reciprocal estimates for +// sqrts, using whatever number of refinements you want"). +// +// The recip versions of these functions are analogous for the op 1/x. - return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); +static bool canDoApproxSqrtOrRecipOnType(EVT VT) { + return VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || + VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 || + VT == MVT::v4f32; +} + +static Optional getRefinementStepsForType(EVT VT) { + if (canDoApproxSqrtOrRecipOnType(VT)) { + // For the reciprocal estimates, convergence is quadratic, so the number of + // digits is doubled after each iteration. In ARMv8, the accuracy of the + // initial estimate is 2^-8. Thus the number of extra steps to refine the + // result for float (23 mantissa bits) is 2 and for double (52 mantissa + // bits) is 3. + // + // FIXME: Do we mean VT.getScalarType() == MVT::f64? + return VT == MVT::f64 ? 
3 : 2; + } + return None; +} + +TargetLowering::RefinementDefaults +AArch64TargetLowering::getSqrtRefinementDefaults(EVT VT) const { + Optional Steps = getRefinementStepsForType(VT); + if (!Steps) { + // The value we choose for DefaultSteps doesn't matter here, because + // getRsqrtEstimate will return SDValue() if getRefinementStepsForType() + // returns None. + return {/*EnabledByDefault=*/false, /*DefaultSteps=*/0}; } - return SDValue(); + return {/*EnabledByDefault=*/Subtarget->useRSqrt(), /*DefaultSteps=*/*Steps}; +} + +TargetLowering::RefinementDefaults +AArch64TargetLowering::getRecipRefinementDefaults(EVT VT) const { + Optional Steps = getRefinementStepsForType(VT); + if (!Steps) { + // The value we choose for DefaultSteps doesn't matter here, because + // getRecipEstimate will return SDValue() if getRefinementStepsForType() + // returns None. + return {/*EnabledByDefault=*/false, /*DefaultSteps=*/0}; + } + + // Div estimates are disabled unless explicitly asked for via the + // reciprocal-estimates function attribute. 
+ return {/*EnabledByDefault=*/false, /*DefaultSteps=*/*Steps}; +} + +Optional AArch64TargetLowering::getSqrtEstimateRefinementSteps( + EVT VT, const MachineFunction &MF) const { + Optional Steps = + TargetLowering::getSqrtEstimateRefinementSteps(VT, MF); + if (Steps) + return 0; + return None; +} + +Optional AArch64TargetLowering::getRecipEstimateRefinementSteps( + EVT VT, const MachineFunction &MF) const { + Optional Steps = + TargetLowering::getRecipEstimateRefinementSteps(VT, MF); + if (Steps) + return 0; + return None; +} + +SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, + SelectionDAG &DAG) const { + EVT VT = Operand.getValueType(); + if (!Subtarget->hasNEON() || !canDoApproxSqrtOrRecipOnType(VT)) + return SDValue(); + + Optional RefinementSteps = + Subtarget->getTargetLowering() + ->TargetLowering::getSqrtEstimateRefinementSteps( + VT, DAG.getMachineFunction()); + if (!RefinementSteps) + return SDValue(); + + SDLoc DL(Operand); + SDValue Estimate = DAG.getNode(AArch64ISD::FRSQRTE, DL, VT, Operand); + + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); + + // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) + // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) + for (int i = *RefinementSteps; i > 0; --i) { + SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, &Flags); + Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); + } + return Estimate; } SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, - SelectionDAG &DAG, int Enabled, - int &ExtraSteps, - bool &UseOneConst, - bool Reciprocal) const { - if (Enabled == ReciprocalEstimate::Enabled || - (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) - if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, - DAG, ExtraSteps)) { - SDLoc DL(Operand); - EVT VT = Operand.getValueType(); + SelectionDAG &DAG) 
const { + SDValue Estimate = getRsqrtEstimate(Operand, DAG); + if (!Estimate) + return SDValue(); - SDNodeFlags Flags; - Flags.setUnsafeAlgebra(true); + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); - // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) - // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) - for (int i = ExtraSteps; i > 0; --i) { - SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, - &Flags); - Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags); - Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); - } + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); - if (!Reciprocal) { - EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), - VT); - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); + EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); + SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); - Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags); - // Correct the result if the operand is 0.0. - Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, - VT, Eq, Operand, Estimate); - } - - ExtraSteps = 0; - return Estimate; - } - - return SDValue(); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags); + // Correct the result if the operand is 0.0. + Estimate = DAG.getNode(VT.isVector() ? 
ISD::VSELECT : ISD::SELECT, DL, VT, Eq, + Operand, Estimate); + return Estimate; } SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, - SelectionDAG &DAG, int Enabled, - int &ExtraSteps) const { - if (Enabled == ReciprocalEstimate::Enabled) - if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, - DAG, ExtraSteps)) { - SDLoc DL(Operand); - EVT VT = Operand.getValueType(); + SelectionDAG &DAG) const { + EVT VT = Operand.getValueType(); + if (!Subtarget->hasNEON() || !canDoApproxSqrtOrRecipOnType(VT)) + return SDValue(); - SDNodeFlags Flags; - Flags.setUnsafeAlgebra(true); + Optional RefinementSteps = + Subtarget->getTargetLowering() + ->TargetLowering::getRecipEstimateRefinementSteps( + VT, DAG.getMachineFunction()); + if (!RefinementSteps) + return SDValue(); - // Newton reciprocal iteration: E * (2 - X * E) - // AArch64 reciprocal iteration instruction: (2 - M * N) - for (int i = ExtraSteps; i > 0; --i) { - SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, - Estimate, &Flags); - Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); - } + SDLoc DL(Operand); + SDValue Estimate = DAG.getNode(AArch64ISD::FRECPE, DL, VT, Operand); - ExtraSteps = 0; - return Estimate; - } + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); - return SDValue(); + // Newton reciprocal iteration: E * (2 - X * E) + // AArch64 reciprocal iteration instruction: (2 - M * N) + for (int i = *RefinementSteps; i > 0; --i) { + SDValue Step = + DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Estimate, &Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); + } + + return Estimate; } //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -183,11 +183,11 @@ bool 
isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { return true; } - SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, - int &RefinementSteps, bool &UseOneConstNR, - bool Reciprocal) const override; - SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, - int &RefinementSteps) const override; + + RefinementDefaults getSqrtRefinementDefaults(EVT) const override; + RefinementDefaults getRecipRefinementDefaults(EVT) const override; + SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG) const override; + SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG) const override; virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const = 0; Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3310,17 +3310,21 @@ return nullptr; } -SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, - SelectionDAG &DAG, int Enabled, - int &RefinementSteps, - bool &UseOneConstNR, - bool Reciprocal) const { - EVT VT = Operand.getValueType(); +TargetLowering::RefinementDefaults +AMDGPUTargetLowering::getSqrtRefinementDefaults(EVT VT) const { + return {/*EnabledByDefault=*/VT == MVT::f32, /*DefaultSteps=*/0}; +} - if (VT == MVT::f32) { - RefinementSteps = 0; +TargetLowering::RefinementDefaults +AMDGPUTargetLowering::getRecipRefinementDefaults(EVT VT) const { + return {/*EnabledByDefault=*/VT == MVT::f32, /*DefaultSteps=*/0}; +} + +SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, + SelectionDAG &DAG) const { + EVT VT = Operand.getValueType(); + if (VT == MVT::f32) return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); - } // TODO: There is also f64 rsq instruction, but the documentation is less // clear on its precision. 
@@ -3329,17 +3333,13 @@ } SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, - SelectionDAG &DAG, int Enabled, - int &RefinementSteps) const { + SelectionDAG &DAG) const { EVT VT = Operand.getValueType(); - if (VT == MVT::f32) { // Reciprocal, < 1 ulp error. // // This reciprocal approximation converges to < 0.5 ulp error with one // newton rhapson performed with two fused multiple adds (FMAs). - - RefinementSteps = 0; return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); } Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.h =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -526,9 +526,9 @@ // to sign-preserving zero. bool useF32FTZ(const MachineFunction &MF) const; - SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, - int &ExtraSteps, bool &UseOneConst, - bool Reciprocal) const override; + RefinementDefaults getSqrtRefinementDefaults(EVT) const override; + SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG) const override; + SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG) const override; bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const; bool allowUnsafeFPMath(MachineFunction &MF) const; Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1043,17 +1043,13 @@ return TargetLoweringBase::getPreferredVectorAction(VT); } -SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, - int Enabled, int &ExtraSteps, - bool &UseOneConst, - bool Reciprocal) const { - if (!(Enabled == ReciprocalEstimate::Enabled || - (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) - return SDValue(); - - if (ExtraSteps == ReciprocalEstimate::Unspecified) - ExtraSteps = 0; 
+TargetLowering::RefinementDefaults +NVPTXTargetLowering::getSqrtRefinementDefaults(EVT) const { + return {/*EnabledByDefault=*/!usePrecSqrtF32(), /*DefaultSteps=*/0}; +} +SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, + SelectionDAG &DAG) const { SDLoc DL(Operand); EVT VT = Operand.getValueType(); bool Ftz = useF32FTZ(DAG.getMachineFunction()); @@ -1063,33 +1059,41 @@ DAG.getConstant(IID, DL, MVT::i32), Operand); }; - // The sqrt and rsqrt refinement processes assume we always start out with an - // approximation of the rsqrt. Therefore, if we're going to do any refinement - // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing - // any refinement, we must return a regular sqrt. - if (Reciprocal || ExtraSteps > 0) { - if (VT == MVT::f32) - return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f - : Intrinsic::nvvm_rsqrt_approx_f); - else if (VT == MVT::f64) - return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); - else - return SDValue(); - } else { - if (VT == MVT::f32) - return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f - : Intrinsic::nvvm_sqrt_approx_f); - else { - // There's no sqrt.approx.f64 instruction, so we emit - // reciprocal(rsqrt(x)). This is faster than - // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain - // x * rsqrt(x).) - return DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), - MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); - } + if (VT == MVT::f32) + return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f + : Intrinsic::nvvm_sqrt_approx_f); + + if (VT == MVT::f64) { + // There's no sqrt.approx.f64 instruction, so we emit + // reciprocal(rsqrt(x)). This is faster than + // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain + // x * rsqrt(x).) 
+ return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), + MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); } + + return SDValue(); +} + +SDValue NVPTXTargetLowering::getRsqrtEstimate(SDValue Operand, + SelectionDAG &DAG) const { + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + bool Ftz = useF32FTZ(DAG.getMachineFunction()); + + Intrinsic::ID IID; + if (VT == MVT::f32) + IID = Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f + : Intrinsic::nvvm_rsqrt_approx_f; + else if (VT == MVT::f64) + IID = Intrinsic::nvvm_rsqrt_approx_d; + else + return SDValue(); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(IID, DL, MVT::i32), Operand); } SDValue Index: llvm/lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.h +++ llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -997,11 +997,13 @@ /// (2) keeping the result of comparison in GPR has performance benefit. 
SDValue ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, - int &RefinementSteps, bool &UseOneConstNR, - bool Reciprocal) const override; - SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, - int &RefinementSteps) const override; + RefinementDefaults getSqrtRefinementDefaults(EVT) const override; + RefinementDefaults getRecipRefinementDefaults(EVT) const override; + bool useOneConstNRForSqrtEstimate(EVT) const override { return true; } + + SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG) const override; + SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG) const override; + unsigned combineRepeatedFPDivisors() const override; CCAssignFn *useFastISelCCs(unsigned Flag) const; Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9857,7 +9857,8 @@ // Target Optimization Hooks //===----------------------------------------------------------------------===// -static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) { +static unsigned getEstimateRefinementSteps(EVT VT, + const PPCSubtarget &Subtarget) { // For the estimates, convergence is quadratic, so we essentially double the // number of digits correct after every iteration. For both FRE and FRSQRTE, // the minimum architected relative accuracy is 2^-5. 
When hasRecipPrec(), @@ -9868,40 +9869,41 @@ return RefinementSteps; } -SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, - int Enabled, int &RefinementSteps, - bool &UseOneConstNR, - bool Reciprocal) const { +TargetLowering::RefinementDefaults +PPCTargetLowering::getSqrtRefinementDefaults(EVT VT) const { + return {/*EnabledByDefault=*/true, getEstimateRefinementSteps(VT, Subtarget)}; +} + +TargetLowering::RefinementDefaults +PPCTargetLowering::getRecipRefinementDefaults(EVT VT) const { + return {/*EnabledByDefault=*/true, getEstimateRefinementSteps(VT, Subtarget)}; +} + +SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, + SelectionDAG &DAG) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || - (VT == MVT::v4f64 && Subtarget.hasQPX())) { - if (RefinementSteps == ReciprocalEstimate::Unspecified) - RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); - - UseOneConstNR = true; + (VT == MVT::v4f64 && Subtarget.hasQPX())) return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); - } + return SDValue(); } -SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, - int Enabled, - int &RefinementSteps) const { +SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, + SelectionDAG &DAG) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRES()) || (VT == MVT::f64 && Subtarget.hasFRE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || - (VT == MVT::v4f64 && Subtarget.hasQPX())) { - if (RefinementSteps == ReciprocalEstimate::Unspecified) - RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); + (VT == MVT::v4f64 && Subtarget.hasQPX())) return 
DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); - } + return SDValue(); } Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -1277,13 +1277,13 @@ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override; /// Use rsqrt* to speed up sqrt calculations. - SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, - int &RefinementSteps, bool &UseOneConstNR, - bool Reciprocal) const override; + bool useOneConstNRForSqrtEstimate(EVT) const override { return false; } + RefinementDefaults getSqrtRefinementDefaults(EVT) const override; + SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG) const override; /// Use rcp* to speed up fdiv calculations. - SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, - int &RefinementSteps) const override; + RefinementDefaults getRecipRefinementDefaults(EVT) const override; + SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG) const override; /// Reassociate floating point divisions into multiply by reciprocal. unsigned combineRepeatedFPDivisors() const override; Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -16399,13 +16399,15 @@ return Subtarget.hasFastScalarFSQRT(); } +TargetLowering::RefinementDefaults +X86TargetLowering::getSqrtRefinementDefaults(EVT VT) const { + return {/*EnabledByDefault=*/true, /*DefaultSteps=*/1}; +} + /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). 
-SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, - SelectionDAG &DAG, int Enabled, - int &RefinementSteps, - bool &UseOneConstNR, - bool Reciprocal) const { +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. @@ -16417,23 +16419,24 @@ // along with FMA, this could be a throughput win. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1()) || - (VT == MVT::v8f32 && Subtarget.hasAVX())) { - if (RefinementSteps == ReciprocalEstimate::Unspecified) - RefinementSteps = 1; - - UseOneConstNR = false; + (VT == MVT::v8f32 && Subtarget.hasAVX())) return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); - } + return SDValue(); } +TargetLowering::RefinementDefaults +X86TargetLowering::getRecipRefinementDefaults(EVT VT) const { + // Enable estimate codegen with 1 refinement step for vector division. + // Scalar division estimates are disabled because they break too much + // real-world code. These defaults are intended to match GCC behavior. + return {/*EnabledByDefault=*/VT.isVector(), /*DefaultSteps=*/1}; +} + /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). -SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, - int Enabled, - int &RefinementSteps) const { - EVT VT = Op.getValueType(); - +SDValue X86TargetLowering::getRecipEstimate(SDValue Op, + SelectionDAG &DAG) const { // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision @@ -16442,20 +16445,12 @@ // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. 
+ EVT VT = Op.getValueType(); if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1()) || - (VT == MVT::v8f32 && Subtarget.hasAVX())) { - // Enable estimate codegen with 1 refinement step for vector division. - // Scalar division estimates are disabled because they break too much - // real-world code. These defaults are intended to match GCC behavior. - if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) - return SDValue(); - - if (RefinementSteps == ReciprocalEstimate::Unspecified) - RefinementSteps = 1; - + (VT == MVT::v8f32 && Subtarget.hasAVX())) return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); - } + return SDValue(); }