Index: llvm/include/llvm/Target/TargetLowering.h
===================================================================
--- llvm/include/llvm/Target/TargetLowering.h
+++ llvm/include/llvm/Target/TargetLowering.h
@@ -2993,21 +2993,24 @@
   /// Hooks for building estimates in place of slower divisions and square
   /// roots.
 
-  /// Return a reciprocal square root estimate value for the input operand.
+  /// Return either a square root or its reciprocal estimate value for the input
+  /// operand.
   /// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
   /// 'Enabled' as set by a potential default override attribute.
   /// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
   /// refinement iterations required to generate a sufficient (though not
   /// necessarily IEEE-754 compliant) estimate is returned in that parameter.
   /// The boolean UseOneConstNR output is used to select a Newton-Raphson
-  /// algorithm implementation that uses one constant or two constants.
+  /// algorithm implementation that uses either one or two constants.
+  /// The boolean Reciprocal is used to select whether the estimate is for the
+  /// square root of the input operand or the reciprocal of its square root.
   /// A target may choose to implement its own refinement within this function.
   /// If that's true, then return '0' as the number of RefinementSteps to avoid
   /// any further refinement of the estimate.
   /// An empty SDValue return means no estimate sequence can be created.
-  virtual SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG,
-                                   int Enabled, int &RefinementSteps,
-                                   bool &UseOneConstNR) const {
+  virtual SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
+                                  int Enabled, int &RefinementSteps,
+                                  bool &UseOneConstNR, bool Reciprocal) const {
     return SDValue();
   }
 
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14858,6 +14858,12 @@
   return S;
 }
 
+/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal, we need to find the zero of the function:
+///   F(X) = A X - 1 [which has a zero at X = 1/A]
+///     =>
+///   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
+///     does not require additional intermediate precision]
 SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
   if (Level >= AfterLegalizeDAG)
     return SDValue();
@@ -14877,19 +14883,13 @@
   // refinement steps.
   int Iterations = TLI.getDivRefinementSteps(VT, MF);
   if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
+    AddToWorklist(Est.getNode());
+
     if (Iterations) {
-      // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
-      // For the reciprocal, we need to find the zero of the function:
-      //   F(X) = A X - 1 [which has a zero at X = 1/A]
-      //     =>
-      //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
-      //     does not require additional intermediate precision]
       EVT VT = Op.getValueType();
       SDLoc DL(Op);
       SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
 
-      AddToWorklist(Est.getNode());
-
       // Newton iterations: Est = Est + Est (1 - Arg * Est)
       for (int i = 0; i < Iterations; ++i) {
         SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
@@ -15030,12 +15030,30 @@
 
   bool UseOneConstNR = false;
   if (SDValue Est =
-      TLI.getRsqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR)) {
+      TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,
+                          Reciprocal)) {
     AddToWorklist(Est.getNode());
+
     if (Iterations) {
       Est = UseOneConstNR
-      ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
-      : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
+            ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
+            : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
+
+      if (!Reciprocal) {
+        // Unfortunately, Est is now NaN if the input was exactly 0.0.
+        // Select out this case and force the answer to 0.0.
+        EVT VT = Op.getValueType();
+        SDLoc DL(Op);
+
+        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+        EVT CCVT = getSetCCResultType(VT);
+        SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
+        AddToWorklist(ZeroCmp.getNode());
+
+        Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
+                          ZeroCmp, FPZero, Est);
+        AddToWorklist(Est.getNode());
+      }
     }
     return Est;
   }
@@ -15048,23 +15066,7 @@
 }
 
 SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
-  SDValue Est = buildSqrtEstimateImpl(Op, Flags, false);
-  if (!Est)
-    return SDValue();
-
-  // Unfortunately, Est is now NaN if the input was exactly 0.
-  // Select out this case and force the answer to 0.
-  EVT VT = Est.getValueType();
-  SDLoc DL(Op);
-  SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
-  EVT CCVT = getSetCCResultType(VT);
-  SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, Zero, ISD::SETEQ);
-  AddToWorklist(ZeroCmp.getNode());
-
-  Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, ZeroCmp,
-                    Zero, Est);
-  AddToWorklist(Est.getNode());
-  return Est;
+  return buildSqrtEstimateImpl(Op, Flags, false);
 }
 
 /// Return true if base is a frame index, which is known not to alias with
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -187,9 +187,9 @@
   SMULL,
   UMULL,
 
-  // Reciprocal estimates.
-  FRECPE,
-  FRSQRTE,
+  // Reciprocal estimates and steps.
+  FRECPE, FRECPS,
+  FRSQRTE, FRSQRTS,
 
   // NEON Load/Store with post-increment base updates
   LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -534,8 +534,9 @@
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         std::vector<SDNode *> *Created) const override;
-  SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
-                           int &ExtraSteps, bool &UseOneConst) const override;
+  SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                          int &ExtraSteps, bool &UseOneConst,
+                          bool Reciprocal) const override;
   SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                            int &ExtraSteps) const override;
   unsigned combineRepeatedFPDivisors() const override;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -959,8 +959,10 @@
   case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
   case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
   case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
-  case AArch64ISD::FRSQRTE:           return "AArch64ISD::FRSQRTE";
   case AArch64ISD::FRECPE:            return "AArch64ISD::FRECPE";
+  case AArch64ISD::FRECPS:            return "AArch64ISD::FRECPS";
+  case AArch64ISD::FRSQRTE:           return "AArch64ISD::FRSQRTE";
+  case AArch64ISD::FRSQRTS:           return "AArch64ISD::FRSQRTS";
   }
   return nullptr;
 }
@@ -4632,15 +4634,43 @@
   return SDValue();
 }
 
-SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
-                                                SelectionDAG &DAG, int Enabled,
-                                                int &ExtraSteps,
-                                                bool &UseOneConst) const {
+SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
+                                               SelectionDAG &DAG, int Enabled,
+                                               int &ExtraSteps,
+                                               bool &UseOneConst,
+                                               bool Reciprocal) const {
   if (Enabled == ReciprocalEstimate::Enabled ||
       (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
                                        DAG, ExtraSteps)) {
-      UseOneConst = true;
+      SDLoc DL(Operand);
+      EVT VT = Operand.getValueType();
+
+      SDNodeFlags Flags;
+      Flags.setUnsafeAlgebra(true);
+
+      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
+      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
+      for (int i = ExtraSteps; i > 0; --i) {
+        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
+                                   &Flags);
+        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags);
+        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+      }
+
+      if (!Reciprocal) {
+        EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                      VT);
+        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+        SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
+
+        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags);
+        // Correct the result if the operand is 0.0.
+        Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
+                               VT, Eq, Operand, Estimate);
+      }
+
+      ExtraSteps = 0;
       return Estimate;
     }
 
@@ -4652,8 +4682,24 @@
                                                 int &ExtraSteps) const {
   if (Enabled == ReciprocalEstimate::Enabled)
     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
-                                       DAG, ExtraSteps))
+                                       DAG, ExtraSteps)) {
+      SDLoc DL(Operand);
+      EVT VT = Operand.getValueType();
+
+      SDNodeFlags Flags;
+      Flags.setUnsafeAlgebra(true);
+
+      // Newton reciprocal iteration: E * (2 - X * E)
+      // AArch64 reciprocal iteration instruction: (2 - M * N)
+      for (int i = ExtraSteps; i > 0; --i) {
+        SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
+                                   Estimate, &Flags);
+        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+      }
+
+      ExtraSteps = 0;
       return Estimate;
+    }
 
   return SDValue();
 }
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -287,7 +287,9 @@
 def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
 
 def AArch64frecpe   : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
+def AArch64frecps   : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>;
 def AArch64frsqrte  : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
+def AArch64frsqrts  : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;
 
 def AArch64saddv    : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
 def AArch64uaddv    : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
@@ -3422,6 +3424,17 @@
 def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
           (FRECPEv2f64 FPR128:$Rn)>;
 
+def : Pat<(f32 (AArch64frecps (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+          (FRECPS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frecps (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+          (FRECPSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frecps (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+          (FRECPSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+          (FRECPS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+          (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
 def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
           (FRECPXv1i32 FPR32:$Rn)>;
 def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
@@ -3447,6 +3460,17 @@
 def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
           (FRSQRTEv2f64 FPR128:$Rn)>;
 
+def : Pat<(f32 (AArch64frsqrts (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+          (FRSQRTS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frsqrts (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+          (FRSQRTSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frsqrts (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+          (FRSQRTSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+          (FRSQRTS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+          (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
 // If an integer is about to be converted to a floating point value,
 // just load it on the floating point unit.
 // Here are the patterns for 8 and 16-bits to float.
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -164,9 +164,9 @@
   bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
     return true;
   }
-  SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
-                           int &RefinementSteps,
-                           bool &UseOneConstNR) const override;
+  SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                           int &RefinementSteps, bool &UseOneConstNR,
+                           bool Reciprocal) const override;
   SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                            int &RefinementSteps) const override;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2858,10 +2858,11 @@
   return nullptr;
 }
 
-SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
-                                               SelectionDAG &DAG, int Enabled,
-                                               int &RefinementSteps,
-                                               bool &UseOneConstNR) const {
+SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
+                                              SelectionDAG &DAG, int Enabled,
+                                              int &RefinementSteps,
+                                              bool &UseOneConstNR,
+                                              bool Reciprocal) const {
   EVT VT = Operand.getValueType();
 
   if (VT == MVT::f32) {
Index: llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -967,9 +967,9 @@
     SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
 
-    SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
-                             int &RefinementSteps,
-                             bool &UseOneConstNR) const override;
+    SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                            int &RefinementSteps, bool &UseOneConstNR,
+                            bool Reciprocal) const override;
     SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                              int &RefinementSteps) const override;
     unsigned combineRepeatedFPDivisors() const override;
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9633,9 +9633,10 @@
   return RefinementSteps;
 }
 
-SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG,
-                                            int Enabled, int &RefinementSteps,
-                                            bool &UseOneConstNR) const {
+SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
+                                           int Enabled, int &RefinementSteps,
+                                           bool &UseOneConstNR,
+                                           bool Reciprocal) const {
   EVT VT = Operand.getValueType();
   if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
       (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -1268,9 +1268,9 @@
     bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
 
     /// Use rsqrt* to speed up sqrt calculations.
-    SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
-                             int &RefinementSteps,
-                             bool &UseOneConstNR) const override;
+    SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                            int &RefinementSteps, bool &UseOneConstNR,
+                            bool Reciprocal) const override;
 
     /// Use rcp* to speed up fdiv calculations.
     SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15228,10 +15228,11 @@
 
 /// The minimum architected relative accuracy is 2^-12. We need one
 /// Newton-Raphson step to have a good float result (24 bits of precision).
-SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
-                                            SelectionDAG &DAG, int Enabled,
-                                            int &RefinementSteps,
-                                            bool &UseOneConstNR) const {
+SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
+                                           SelectionDAG &DAG, int Enabled,
+                                           int &RefinementSteps,
+                                           bool &UseOneConstNR,
+                                           bool Reciprocal) const {
   EVT VT = Op.getValueType();
 
   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
Index: llvm/test/CodeGen/AArch64/recp-fastmath.ll
===================================================================
--- llvm/test/CodeGen/AArch64/recp-fastmath.ll
+++ llvm/test/CodeGen/AArch64/recp-fastmath.ll
@@ -16,8 +16,8 @@
 
 ; CHECK-LABEL: frecp1:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: frecpe
-; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe [[R:s[0-7]]]
+; CHECK-NEXT: frecps {{s[0-7](, s[0-7])?}}, [[R]]
 }
 
 define <2 x float> @f2recp0(<2 x float> %x) #0 {
@@ -36,8 +36,8 @@
 
 ; CHECK-LABEL: f2recp1:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frecpe
+; CHECK-NEXT: frecpe [[R:v[0-7]\.2s]]
+; CHECK-NEXT: frecps {{v[0-7]\.2s(, v[0-7].2s)?}}, [[R]]
 }
 
 define <4 x float> @f4recp0(<4 x float> %x) #0 {
@@ -56,8 +56,8 @@
 
 ; CHECK-LABEL: f4recp1:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frecpe
+; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]]
+; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]]
 }
 
 define <8 x float> @f8recp0(<8 x float> %x) #0 {
@@ -77,9 +77,10 @@
 
 ; CHECK-LABEL: f8recp1:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frecpe
-; CHECK: frecpe
+; CHECK-NEXT: frecpe [[RA:v[0-7]\.4s]]
+; CHECK-NEXT: frecpe [[RB:v[0-7]\.4s]]
+; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RA]]
+; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RB]]
 }
 
 define double @drecp0(double %x) #0 {
@@ -98,8 +99,8 @@
 
 ; CHECK-LABEL: drecp1:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: frecpe
-; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe [[R:d[0-7]]]
+; CHECK-NEXT: frecps {{d[0-7](, d[0-7])?}}, [[R]]
 }
 
 define <2 x double> @d2recp0(<2 x double> %x) #0 {
@@ -118,8 +119,8 @@
 
 ; CHECK-LABEL: d2recp1:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frecpe
+; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]]
+; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]]
 }
 
 define <4 x double> @d4recp0(<4 x double> %x) #0 {
@@ -139,9 +140,10 @@
 
 ; CHECK-LABEL: d4recp1:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frecpe
-; CHECK: frecpe
+; CHECK-NEXT: frecpe [[RA:v[0-7]\.2d]]
+; CHECK-NEXT: frecpe [[RB:v[0-7]\.2d]]
+; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RA]]
+; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RB]]
 }
 
 attributes #0 = { nounwind "unsafe-fp-math"="true" }
Index: llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -19,8 +19,10 @@
 
 ; CHECK-LABEL: fsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:s[0-7]]]
+; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]]
+; CHECK: fcmp s0, #0
 }
 
 define <2 x float> @f2sqrt(<2 x float> %a) #0 {
@@ -33,9 +35,10 @@
 
 ; CHECK-LABEL: f2sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]]
+; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]]
+; CHECK: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0
 }
 
 define <4 x float> @f4sqrt(<4 x float> %a) #0 {
@@ -48,9 +51,10 @@
 
 ; CHECK-LABEL: f4sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
+; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
+; CHECK: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0
 }
 
 define <8 x float> @f8sqrt(<8 x float> %a) #0 {
@@ -64,10 +68,10 @@
 
 ; CHECK-LABEL: f8sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
-; CHECK-NEXT: frsqrte
-; CHECK: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
+; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
+; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
+; CHECK: fcmeq {{v[0-7]\.4s, v[0-1]\.4s}}, #0
 }
 
 define double @dsqrt(double %a) #0 {
@@ -80,8 +84,10 @@
 
 ; CHECK-LABEL: dsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:d[0-7]]]
+; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]]
+; CHECK: fcmp d0, #0
 }
 
 define <2 x double> @d2sqrt(<2 x double> %a) #0 {
@@ -94,9 +100,10 @@
 
 ; CHECK-LABEL: d2sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
+; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
+; CHECK: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0
 }
 
 define <4 x double> @d4sqrt(<4 x double> %a) #0 {
@@ -110,10 +117,10 @@
 
 ; CHECK-LABEL: d4sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
-; CHECK-NEXT: frsqrte
-; CHECK: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
+; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
+; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
+; CHECK: fcmeq {{v[0-7]\.2d, v[0-1]\.2d}}, #0
 }
 
 define float @frsqrt(float %a) #0 {
@@ -127,8 +134,10 @@
 
 ; CHECK-LABEL: frsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:s[0-7]]]
+; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]]
+; CHECK-NOT: fcmp {{s[0-7]}}, #0
 }
 
 define <2 x float> @f2rsqrt(<2 x float> %a) #0 {
@@ -142,8 +151,10 @@
 
 ; CHECK-LABEL: f2rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]]
+; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]]
+; CHECK-NOT: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0
 }
 
 define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
@@ -157,8 +168,10 @@
 
 ; CHECK-LABEL: f4rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
+; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
+; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0
 }
 
 define <8 x float> @f8rsqrt(<8 x float> %a) #0 {
@@ -173,9 +186,10 @@
 
 ; CHECK-LABEL: f8rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frsqrte
-; CHECK: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
+; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
+; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
+; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0
 }
 
 define double @drsqrt(double %a) #0 {
@@ -189,8 +203,10 @@
 
 ; CHECK-LABEL: drsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:d[0-7]]]
+; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]]
+; CHECK-NOT: fcmp d0, #0
 }
 
 define <2 x double> @d2rsqrt(<2 x double> %a) #0 {
@@ -204,8 +220,10 @@
 
 ; CHECK-LABEL: d2rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
+; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
+; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0
 }
 
 define <4 x double> @d4rsqrt(<4 x double> %a) #0 {
@@ -220,9 +238,10 @@
 
 ; CHECK-LABEL: d4rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: frsqrte
-; CHECK: frsqrte
+; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
+; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
+; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
+; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0
 }
 
 attributes #0 = { nounwind "unsafe-fp-math"="true" }