Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -163,6 +163,9 @@
       VMULLs,       // ...signed
       VMULLu,       // ...unsigned
 
+      // Reciprocal square root estimate.
+      VRSQRTE,
+
       UMLAL,        // 64bit Unsigned Accumulate Multiply
       SMLAL,        // 64bit Signed Accumulate Multiply
       UMAAL,        // 64-bit Unsigned Accumulate Accumulate Multiply
@@ -323,7 +326,9 @@
                                        APInt &KnownOne,
                                        const SelectionDAG &DAG,
                                        unsigned Depth) const override;
-
+    SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                             unsigned &RefinementSteps,
+                             bool &UseOneConstNR) const override;
 
     bool ExpandInlineAsm(CallInst *CI) const override;
 
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -1387,6 +1387,7 @@
   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
+  case ARMISD::VRSQRTE:       return "ARMISD::VRSQRTE";
   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
@@ -11884,6 +11885,39 @@
   }
 }
 
+/// getEstimate - Build the estimate node \p Opcode for \p Operand. Returns an
+/// empty SDValue unless the subtarget has NEON, the type is f32, v2f32 or
+/// v4f32, and the "sqrtf" / "vec-sqrtf" reciprocal option is enabled. On
+/// success \p ExtraSteps is set to that option's refinement step count.
+static SDValue getEstimate(const ARMSubtarget &ST,
+                           const ARMTargetLowering::DAGCombinerInfo &DCI,
+                           unsigned Opcode, const SDValue &Operand,
+                           unsigned &ExtraSteps) {
+  if (!ST.hasNEON())
+    return SDValue();
+
+  EVT VT = Operand.getValueType();
+  if (VT != MVT::f32 && VT != MVT::v2f32 && VT != MVT::v4f32)
+    return SDValue();
+
+  const std::string RecipOp = VT.isVector() ? "vec-sqrtf" : "sqrtf";
+  // Bind by reference: copying the options on every query is wasted work.
+  const TargetRecip &Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+  ExtraSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+}
+
+/// Emit an ARMISD::VRSQRTE estimate; always sets \p UseOneConstNR to true.
+SDValue ARMTargetLowering::getRsqrtEstimate(
+    SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps,
+    bool &UseOneConstNR) const {
+  UseOneConstNR = true;
+  return getEstimate(*Subtarget, DCI, ARMISD::VRSQRTE, Operand,
+                     RefinementSteps);
+}
+
 //===----------------------------------------------------------------------===//
 //                           ARM Inline Assembly Support
 //===----------------------------------------------------------------------===//
Index: lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- lib/Target/ARM/ARMInstrNEON.td
+++ lib/Target/ARM/ARMInstrNEON.td
@@ -503,6 +503,7 @@
 def NEONvcltz     : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
 def NEONvcgtu     : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
 def NEONvtst      : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+def NEONfrsqrte   : SDNode<"ARMISD::VRSQRTE", SDTFPUnaryOp>;
 
 // Types for vector shift by immediates.  The "SHX" version is for long and
 // narrow operations where the source and destination vectors have different
@@ -5392,6 +5393,13 @@
                          v8f16, v8f16, int_arm_neon_vrsqrte>,
                 Requires<[HasNEON, HasFullFP16]>;
 
+// NEONfrsqrte selection; the scalar form runs VRSQRTEfd on lane ssub_0.
+def : Pat<(NEONfrsqrte SPR:$Vm),
+          (EXTRACT_SUBREG (VRSQRTEfd (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+                                                    SPR:$Vm, ssub_0)), ssub_0)>;
+def : Pat<(v2f32 (NEONfrsqrte (v2f32 DPR:$Vm))), (VRSQRTEfd DPR:$Vm)>;
+def : Pat<(v4f32 (NEONfrsqrte (v4f32 QPR:$Vm))), (VRSQRTEfq QPR:$Vm)>;
+
 //   VRSQRTS  : Vector Reciprocal Square Root Step
 def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
                         IIC_VRECSD, "vrsqrts", "f32",
Index: lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- lib/Target/ARM/ARMTargetMachine.cpp
+++ lib/Target/ARM/ARMTargetMachine.cpp
@@ -29,6 +29,11 @@
 using namespace llvm;
 
 static cl::opt<bool>
+EnableReciprocalSquareRoot("enable-reciprocal-square-root", cl::Hidden,
+                   cl::desc("Enable Reciprocal Square Root Optimisation"),
+                   cl::init(false));
+
+static cl::opt<bool>
 DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
                    cl::desc("Inhibit optimization of S->D register accesses on A15"),
                    cl::init(false));
@@ -198,6 +203,17 @@
   return *RM;
 }
 
+/// Register reciprocal square-root defaults on \p TM: enabled only when
+/// -enable-reciprocal-square-root is set; 2 steps for *f keys, 4 for *d keys.
+static void initReciprocals(ARMTargetMachine &TM) {
+  const bool UseRsqrt = EnableReciprocalSquareRoot;
+  auto &Recips = TM.Options.Reciprocals;
+  Recips.setDefaults("sqrtf", UseRsqrt, 2);
+  Recips.setDefaults("sqrtd", UseRsqrt, 4);
+  Recips.setDefaults("vec-sqrtf", UseRsqrt, 2);
+  Recips.setDefaults("vec-sqrtd", UseRsqrt, 4);
+}
+
 /// Create an ARM architecture model.
 ///
 ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
@@ -281,6 +297,7 @@
                                    CodeModel::Model CM, CodeGenOpt::Level OL,
                                    bool isLittle)
     : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
+  initReciprocals(*this);
   initAsmInfo();
   if (!Subtarget.hasARMOps())
     report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "