Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -163,6 +163,9 @@
       VMULLs,       // ...signed
       VMULLu,       // ...unsigned
 
+      // Reciprocal square root estimate.
+      VRSQRTE,
+
       UMLAL,        // 64bit Unsigned Accumulate Multiply
       SMLAL,        // 64bit Signed Accumulate Multiply
       UMAAL,        // 64-bit Unsigned Accumulate Accumulate Multiply
@@ -323,7 +326,9 @@
                                        APInt &KnownOne,
                                        const SelectionDAG &DAG,
                                        unsigned Depth) const override;
-
+    SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                             unsigned &RefinementSteps,
+                             bool &UseOneConstNR) const override;
 
     bool ExpandInlineAsm(CallInst *CI) const override;
 
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -1387,6 +1387,7 @@
   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
+  case ARMISD::VRSQRTE:       return "ARMISD::VRSQRTE";
   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
@@ -11884,6 +11885,48 @@
   }
 }
 
+/// getEstimate - Build the estimate node \p Opcode for \p Operand.
+///
+/// Returns a null SDValue when the subtarget lacks NEON, when the operand type
+/// is not f32, v2f32 or v4f32, or when the matching reciprocal option ("sqrtf"
+/// for scalars, "vec-sqrtf" for vectors) is disabled. On success \p ExtraSteps
+/// is set to the refinement-step count configured for that option.
+static SDValue getEstimate(const ARMSubtarget &ST,
+                           const ARMTargetLowering::DAGCombinerInfo &DCI,
+                           unsigned Opcode, const SDValue &Operand,
+                           unsigned &ExtraSteps) {
+  if (!ST.hasNEON())
+    return SDValue();
+
+  EVT VT = Operand.getValueType();
+
+  // Only single-precision scalars and the two f32 vector types are handled.
+  if (VT != MVT::f32 && VT != MVT::v2f32 && VT != MVT::v4f32)
+    return SDValue();
+
+  // Name of the reciprocal option that governs this operand type.
+  const char *RecipOp = VT.isVector() ? "vec-sqrtf" : "sqrtf";
+
+  // The settings are only read, so bind by reference instead of copying them.
+  const TargetRecip &Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  ExtraSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+}
+
+/// Emit an ARMISD::VRSQRTE estimate for \p Operand, or a null SDValue when
+/// getEstimate() declines. \p UseOneConstNR is set to true in either case.
+SDValue ARMTargetLowering::getRsqrtEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
+                                            bool &UseOneConstNR) const {
+  UseOneConstNR = true;
+  return getEstimate(*Subtarget, DCI, ARMISD::VRSQRTE, Operand,
+                     RefinementSteps);
+}
+
 //===----------------------------------------------------------------------===//
 //                           ARM Inline Assembly Support
 //===----------------------------------------------------------------------===//
Index: lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- lib/Target/ARM/ARMInstrNEON.td
+++ lib/Target/ARM/ARMInstrNEON.td
@@ -503,6 +503,7 @@
 def NEONvcltz     : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
 def NEONvcgtu     : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
 def NEONvtst      : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+def NEONfrsqrte   : SDNode<"ARMISD::VRSQRTE", SDTFPUnaryOp>;
 
 // Types for vector shift by immediates.  The "SHX" version is for long and
 // narrow operations where the source and destination vectors have different
@@ -5392,6 +5393,17 @@
                         v8f16, v8f16, int_arm_neon_vrsqrte>,
                 Requires<[HasNEON, HasFullFP16]>;
 
+// Select ARMISD::VRSQRTE. The scalar form inserts the f32 at ssub_0 of an
+// undefined v2f32, runs VRSQRTEfd on it and extracts ssub_0 of the result.
+def : Pat<(NEONfrsqrte SPR:$Vm),
+          (EXTRACT_SUBREG
+            (VRSQRTEfd (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Vm, ssub_0)),
+            ssub_0)>;
+def : Pat<(v2f32 (NEONfrsqrte (v2f32 DPR:$Vm))),
+          (VRSQRTEfd DPR:$Vm)>;
+def : Pat<(v4f32 (NEONfrsqrte (v4f32 QPR:$Vm))),
+          (VRSQRTEfq QPR:$Vm)>;
+
 // VRSQRTS  : Vector Reciprocal Square Root Step
 def  VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
                          IIC_VRECSD, "vrsqrts", "f32",
Index: lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- lib/Target/ARM/ARMTargetMachine.cpp
+++ lib/Target/ARM/ARMTargetMachine.cpp
@@ -29,6 +29,11 @@
 using namespace llvm;
 
 static cl::opt<bool>
+EnableReciprocalSquareRoot("enable-reciprocal-square-root", cl::Hidden,
+                   cl::desc("Enable Reciprocal Square Root Optimisation"),
+                   cl::init(false));
+
+static cl::opt<bool>
 DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
                    cl::desc("Inhibit optimization of S->D register accesses on A15"),
                    cl::init(false));
@@ -198,6 +203,22 @@
   return *RM;
 }
 
+/// Seed the defaults for the square-root reciprocal-estimate options. The
+/// "sqrtf", "sqrtd", "vec-sqrtf" and "vec-sqrtd" keys are enabled only when
+/// -enable-reciprocal-square-root is set; the "f" keys are registered with 2
+/// refinement steps and the "d" keys with 4.
+/// NOTE(review): within this patch only the ARMTargetMachine constructor calls
+/// this helper -- confirm the Thumb target machines also initialise these keys
+/// before getRsqrtEstimate() can query them.
+static void initReciprocals(ARMTargetMachine &TM) {
+  const bool UseRsqrt = EnableReciprocalSquareRoot;
+
+  TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, 2);
+  TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, 4);
+  TM.Options.Reciprocals.setDefaults("vec-sqrtf", UseRsqrt, 2);
+  TM.Options.Reciprocals.setDefaults("vec-sqrtd", UseRsqrt, 4);
+}
+
 /// Create an ARM architecture model.
 ///
 ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
@@ -281,6 +302,7 @@
                                    CodeModel::Model CM, CodeGenOpt::Level OL,
                                    bool isLittle)
     : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
+  initReciprocals(*this);
   initAsmInfo();
   if (!Subtarget.hasARMOps())
     report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "