Index: llvm/lib/Target/AArch64/AArch64.td =================================================================== --- llvm/lib/Target/AArch64/AArch64.td +++ llvm/lib/Target/AArch64/AArch64.td @@ -58,6 +58,13 @@ "Reserve X18, making it unavailable " "as a GPR">; +def FeatureApproximateSqrt : SubtargetFeature + <"approx-sqrt", "HasApproximateSqrt", "true", + "Enable the approximation of square-root", [FeatureNEON]>; +def FeatureApproximateDiv : SubtargetFeature + <"approx-div", "HasApproximateDiv", "true", + "Enable the approximation of floating-point division", [FeatureNEON]>; + //===----------------------------------------------------------------------===// // Architectures. // @@ -131,7 +138,8 @@ FeatureNEON, FeatureCrypto, FeatureCRC, - FeaturePerfMon]>; + FeaturePerfMon, + FeatureApproximateSqrt]>; def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", "Qualcomm Kryo processors", Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -187,6 +187,10 @@ SMULL, UMULL, + // Reciprocal estimates. 
+ FRECPE, + FRSQRTE, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -511,6 +515,11 @@ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -965,6 +965,8 @@ case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; + case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; + case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; } return nullptr; } @@ -4620,6 +4622,40 @@ // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// +/// getEstimate - Return the appropriate estimate DAG for either the reciprocal +/// or the reciprocal square root. +static SDValue getEstimate(const AArch64Subtarget &ST, + const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode, + const SDValue &Operand, unsigned &ExtraSteps) { + if (!ST.hasNEON()) + return SDValue(); + + EVT VT = Operand.getValueType(); + + std::string RecipOp; + RecipOp = Opcode == AArch64ISD::FRECPE? "div": "sqrt"; + RecipOp = (VT.isVector()? "vec-": "") + RecipOp; + RecipOp += VT.getScalarType() == MVT::f64? 
"d": "f"; + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + ExtraSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); +} + +SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps) const { + return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps); +} + +SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const { + UseOneConst = true; + return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps); +} + //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -283,6 +283,9 @@ def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; +def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; +def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>; + def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; @@ -3401,6 +3404,19 @@ def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))), + (FRECPEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))), + (FRECPEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frecpe (f64 
FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))), + (FRECPEv2f64 FPR128:$Rn)>; + def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), (FRECPXv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), @@ -3413,6 +3429,19 @@ def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))), + (FRSQRTEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))), + (FRSQRTEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))), + (FRSQRTEv2f64 FPR128:$Rn)>; + // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. // Here are the patterns for 8 and 16-bits to float. Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -58,6 +58,9 @@ bool HasFullFP16; bool HasSPE; + bool HasApproximateSqrt; + bool HasApproximateDiv; + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
bool HasZeroCycleRegMove; @@ -147,6 +150,8 @@ bool hasPerfMon() const { return HasPerfMon; } bool hasFullFP16() const { return HasFullFP16; } bool hasSPE() const { return HasSPE; } + bool hasApproximateSqrt() const { return HasApproximateSqrt; } + bool hasApproximateDiv() const { return HasApproximateDiv; } bool isLittleEndian() const { return IsLittle; } Index: llvm/lib/Target/AArch64/AArch64Subtarget.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -52,6 +52,7 @@ : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false), + HasApproximateSqrt(false), HasApproximateDiv(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), Index: llvm/lib/Target/AArch64/AArch64TargetMachine.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -46,7 +46,7 @@ } private: - bool isLittle; + AArch64Subtarget Subtarget; }; // AArch64leTargetMachine - AArch64 little endian target machine. Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -136,6 +136,36 @@ return "E-m:e-i64:64-i128:128-n32:64-S128"; } +// Helper function to set up the defaults for reciprocals. +static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST) +{ + // For the estimates, convergence is quadratic, so essentially the number of + // digits is doubled after each iteration. 
In ARMv8, the minimum architected + // accuracy of the initial estimate is 2^-8. The number of extra steps + // to refine the result for float (23 mantissa bits) and for double + // (52 mantissa bits) are 2 and 3, respectively. + unsigned ExtraStepsF = 2, + ExtraStepsD = ExtraStepsF + 1; + + TM.Options.Reciprocals.setDefaults + ("sqrtf", ST.hasApproximateSqrt(), ExtraStepsF); + TM.Options.Reciprocals.setDefaults + ("sqrtd", ST.hasApproximateSqrt(), ExtraStepsD); + TM.Options.Reciprocals.setDefaults + ("vec-sqrtf", ST.hasApproximateSqrt(), ExtraStepsF); + TM.Options.Reciprocals.setDefaults + ("vec-sqrtd", ST.hasApproximateSqrt(), ExtraStepsD); + + TM.Options.Reciprocals.setDefaults + ("divf", ST.hasApproximateDiv(), ExtraStepsF); + TM.Options.Reciprocals.setDefaults + ("divd", ST.hasApproximateDiv(), ExtraStepsD); + TM.Options.Reciprocals.setDefaults + ("vec-divf", ST.hasApproximateDiv(), ExtraStepsF); + TM.Options.Reciprocals.setDefaults + ("vec-divd", ST.hasApproximateDiv(), ExtraStepsD); +} + /// TargetMachine ctor - Create an AArch64 architecture model. /// AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, @@ -149,7 +179,8 @@ : LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS, Options, RM, CM, OL), TLOF(createTLOF(getTargetTriple())), - isLittle(LittleEndian) { + Subtarget(TT, CPU, FS, *this, LittleEndian) { + initReciprocals(*this, Subtarget); initAsmInfo(); } @@ -189,7 +220,7 @@ // function that reside in TargetOptions. resetTargetOptions(F); I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, - isLittle); + Subtarget.isLittleEndian()); #ifndef LLVM_BUILD_GLOBAL_ISEL GISelAccessor *GISel = new GISelAccessor(); #else