Index: llvm/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -537,6 +537,13 @@
     /// FSINCOS - Compute both fsin and fcos as a single operation.
     FSINCOS,
 
+    /// FRECPI - One Newton-Raphson refinement step for 1/x. Operand 0 is x
+    /// and operand 1 is the current estimate; the result is the new estimate.
+    FRECPI,
+    /// FRSQRTI - One Newton-Raphson refinement step for 1/sqrt(x). Operand 0
+    /// is x and operand 1 is the current estimate; the result is the new one.
+    FRSQRTI,
+
     /// LOAD and STORE have token chains as their first operand, then the same
     /// operands as an LLVM load/store instruction, then an offset node that
     /// is added / subtracted from the base pointer to form the address (for
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -364,6 +364,8 @@
                                 SDNodeFlags *Flags, bool Reciprocal);
     SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
                                 SDNodeFlags *Flags, bool Reciprocal);
+    SDValue buildSqrtNRNative(SDValue Op, SDValue Est, unsigned Iterations,
+                              SDNodeFlags *Flags, bool Reciprocal);
     SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
                                bool DemandHighBits = true);
     SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
@@ -14497,41 +14499,49 @@
   return S;
 }
 
-SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
+/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal, we need to find the zero of the function:
+///   F(X) = A X - 1 [which has a zero at X = 1/A]
+///     =>
+///   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
+///     does not require additional intermediate precision]
+SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Arg, SDNodeFlags *Flags) {
   if (Level >= AfterLegalizeDAG)
     return SDValue();
 
   // Expose the DAG combiner to the target combiner implementations.
   TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
-
   unsigned Iterations = 0;
-  if (SDValue Est = TLI.getRecipEstimate(Op, DCI, Iterations)) {
+  if (SDValue Est = TLI.getRecipEstimate(Arg, DCI, Iterations)) {
     if (Iterations) {
-      // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
-      // For the reciprocal, we need to find the zero of the function:
-      //   F(X) = A X - 1 [which has a zero at X = 1/A]
-      //     =>
-      //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
-      //     does not require additional intermediate precision]
-      EVT VT = Op.getValueType();
-      SDLoc DL(Op);
-      SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
-
       AddToWorklist(Est.getNode());
 
-      // Newton iterations: Est = Est + Est (1 - Arg * Est)
-      for (unsigned i = 0; i < Iterations; ++i) {
-        SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
-        AddToWorklist(NewEst.getNode());
+      EVT VT = Arg.getValueType();
+      SDLoc DL(Arg);
+
+      if (TLI.hasTargetDAGCombine((ISD::NodeType)ISD::FRECPI))
+        // Newton iterations for reciprocal.
+        for (unsigned i = 0; i < Iterations; ++i) {
+          Est = DAG.getNode(ISD::FRECPI, DL, VT, Arg, Est, Flags);
+          AddToWorklist(Est.getNode());
+        }
+      else {
+        SDValue One = DAG.getConstantFP(1.0, DL, VT);
+
+        // Newton iterations: Est = Est + Est (1 - Arg * Est)
+        for (unsigned i = 0; i < Iterations; ++i) {
+          SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
+          AddToWorklist(NewEst.getNode());
 
-        NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags);
-        AddToWorklist(NewEst.getNode());
+          NewEst = DAG.getNode(ISD::FSUB, DL, VT, One, NewEst, Flags);
+          AddToWorklist(NewEst.getNode());
 
-        NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
-        AddToWorklist(NewEst.getNode());
+          NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
+          AddToWorklist(NewEst.getNode());
 
-        Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags);
-        AddToWorklist(Est.getNode());
+          Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags);
+          AddToWorklist(Est.getNode());
+        }
       }
     }
     return Est;
@@ -14634,6 +14644,33 @@
   return Est;
 }
 
+/// Refine a reciprocal square root estimate with the target's native Newton
+/// step (ISD::FRSQRTI). Each step moves toward the zero of:
+///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+///     =>
+///   X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
+/// If \p Reciprocal is false, sqrt(Arg) is formed as Arg * rsqrt(Arg).
+SDValue DAGCombiner::buildSqrtNRNative(SDValue Arg, SDValue Est,
+                                       unsigned Iterations,
+                                       SDNodeFlags *Flags, bool Reciprocal) {
+  const SDLoc Loc(Arg);
+  const EVT Ty = Arg.getValueType();
+
+  // Emit one target-defined FRSQRTI step per requested iteration.
+  for (unsigned Remaining = Iterations; Remaining != 0; --Remaining) {
+    Est = DAG.getNode(ISD::FRSQRTI, Loc, Ty, Arg, Est, Flags);
+    AddToWorklist(Est.getNode());
+  }
+
+  if (Reciprocal)
+    return Est;
+
+  // A plain square root was requested: sqrt(Arg) = Arg * (1/sqrt(Arg)).
+  SDValue Root = DAG.getNode(ISD::FMUL, Loc, Ty, Arg, Est, Flags);
+  AddToWorklist(Root.getNode());
+  return Root;
+}
+
 /// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
 /// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
 /// Op can be zero.
@@ -14649,9 +14686,13 @@
   if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) {
     AddToWorklist(Est.getNode());
     if (Iterations) {
-      Est = UseOneConstNR
-                ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
-                : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
+      // Check if the target supports the estimate step natively.
+      if (TLI.hasTargetDAGCombine((ISD::NodeType)ISD::FRSQRTI))
+        Est = buildSqrtNRNative(Op, Est, Iterations, Flags, Reciprocal);
+      else
+        Est = UseOneConstNR
+              ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
+              : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
     }
     return Est;
   }
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -200,6 +200,8 @@
   case ISD::FMA:                        return "fma";
   case ISD::FMAD:                       return "fmad";
   case ISD::FREM:                       return "frem";
+  case ISD::FRECPI:                     return "FRecpI";
+  case ISD::FRSQRTI:                    return "FRSqrtI";
   case ISD::FCOPYSIGN:                  return "fcopysign";
   case ISD::FGETSIGN:                   return "fgetsign";
   case ISD::FCANONICALIZE:              return "fcanonicalize";
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -187,9 +187,9 @@
   SMULL,
   UMULL,
 
-  // Reciprocal estimates.
-  FRECPE,
-  FRSQRTE,
+  // Reciprocal estimates and steps.
+  FRECPE, FRECPS,
+  FRSQRTE, FRSQRTS,
 
   // NEON Load/Store with post-increment base updates
   LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -495,6 +495,10 @@
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 
+  // AArch64 has instructions that perform part of a Newton iteration step.
+  setTargetDAGCombine(ISD::FRECPI);
+  setTargetDAGCombine(ISD::FRSQRTI);
+
   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
@@ -953,8 +957,10 @@
   case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
   case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
   case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
-  case AArch64ISD::FRSQRTE:           return "AArch64ISD::FRSQRTE";
   case AArch64ISD::FRECPE:            return "AArch64ISD::FRECPE";
+  case AArch64ISD::FRECPS:            return "AArch64ISD::FRECPS";
+  case AArch64ISD::FRSQRTE:           return "AArch64ISD::FRSQRTE";
+  case AArch64ISD::FRSQRTS:           return "AArch64ISD::FRSQRTS";
   }
   return nullptr;
 }
@@ -4588,7 +4594,7 @@
 
 SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
   DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const {
-  UseOneConst = true;
+  UseOneConst = false;
   return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps);
 }
 
@@ -7810,6 +7816,45 @@
                      DAG.getConstant(C, DL, MVT::i32));
 }
 
+/// Expand ISD::FRECPI (operand 0 = argument, operand 1 = current estimate)
+/// into one Newton step for the reciprocal: Est * (2 - Arg * Est).
+static SDValue performFRECPICombine(SDNode *N, SelectionDAG &DAG,
+                                    const AArch64Subtarget *Subtarget) {
+  const SDValue X = N->getOperand(0);
+  const SDValue Guess = N->getOperand(1);
+  const EVT Ty = X.getValueType();
+  const SDLoc Loc(X);
+
+  // Both new nodes are tagged with the unsafe-algebra flag.
+  SDNodeFlags FMF;
+  FMF.setUnsafeAlgebra(true);
+
+  // FRECPS yields the correction factor (2 - Rn * Rm); scaling the current
+  // estimate by it completes the iteration.
+  SDValue Factor = DAG.getNode(AArch64ISD::FRECPS, Loc, Ty, X, Guess, &FMF);
+  return DAG.getNode(ISD::FMUL, Loc, Ty, Factor, Guess, &FMF);
+}
+
+/// Expand ISD::FRSQRTI (operand 0 = argument, operand 1 = current estimate)
+/// into one Newton step for 1/sqrt: Est * 0.5 * (3 - Arg * Est^2).
+static SDValue performFRSQRTICombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  const SDValue X = N->getOperand(0);
+  const SDValue Guess = N->getOperand(1);
+  const EVT Ty = X.getValueType();
+  const SDLoc Loc(X);
+
+  // All three new nodes are tagged with the unsafe-algebra flag.
+  SDNodeFlags FMF;
+  FMF.setUnsafeAlgebra(true);
+
+  // FRSQRTS yields 0.5 * (3 - Rn * Rm); feed it Arg and Est^2, then scale
+  // the current estimate by the resulting correction factor.
+  SDValue Sq = DAG.getNode(ISD::FMUL, Loc, Ty, Guess, Guess, &FMF);
+  SDValue Factor = DAG.getNode(AArch64ISD::FRSQRTS, Loc, Ty, X, Sq, &FMF);
+  return DAG.getNode(ISD::FMUL, Loc, Ty, Guess, Factor, &FMF);
+}
+
 /// An EXTR instruction is made up of two shifts, ORed together. This helper
 /// searches for and classifies those shifts.
 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
@@ -9898,6 +9943,10 @@
     return performPostLD1Combine(N, DCI, true);
   case ISD::EXTRACT_VECTOR_ELT:
     return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
+  case ISD::FRECPI:
+    return performFRECPICombine(N, DAG, Subtarget);
+  case ISD::FRSQRTI:
+    return performFRSQRTICombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -287,7 +287,9 @@
 def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
 
 def AArch64frecpe   : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
+def AArch64frecps   : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>;
 def AArch64frsqrte  : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
+def AArch64frsqrts  : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;
 
 def AArch64saddv    : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
 def AArch64uaddv    : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
@@ -3414,6 +3416,17 @@
 def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
           (FRECPEv2f64 FPR128:$Rn)>;
 
+def : Pat<(f32 (AArch64frecps (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+          (FRECPS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frecps (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+          (FRECPSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frecps (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+          (FRECPSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+          (FRECPS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+          (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
 def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
           (FRECPXv1i32 FPR32:$Rn)>;
 def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
@@ -3439,6 +3452,17 @@
 def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
           (FRSQRTEv2f64 FPR128:$Rn)>;
 
+def : Pat<(f32 (AArch64frsqrts (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+          (FRSQRTS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frsqrts (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+          (FRSQRTSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frsqrts (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+          (FRSQRTSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+          (FRSQRTS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+          (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
 // If an integer is about to be converted to a floating point value,
 // just load it on the floating point unit.
 // Here are the patterns for 8 and 16-bits to float.
Index: llvm/test/CodeGen/AArch64/recp-fastmath.ll
===================================================================
--- llvm/test/CodeGen/AArch64/recp-fastmath.ll
+++ llvm/test/CodeGen/AArch64/recp-fastmath.ll
@@ -13,7 +13,7 @@
 ; CHECK-LABEL: frecp:
 ; CHECK-NEXT: BB#0
 ; CHECK-NEXT: frecpe
-; CHECK-NEXT: fmov
+; CHECK-NEXT: frecps
 }
 
 define <2 x float> @f2recp(<2 x float> %x) #0 {
@@ -27,8 +27,8 @@
 
 ; CHECK-LABEL: f2recp:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frecpe
+; CHECK-NEXT: frecps
 }
 
 define <4 x float> @f4recp(<4 x float> %x) #0 {
@@ -42,8 +42,8 @@
 
 ; CHECK-LABEL: f4recp:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frecpe
+; CHECK-NEXT: frecps
 }
 
 define <8 x float> @f8recp(<8 x float> %x) #0 {
@@ -58,9 +58,10 @@
 
 ; CHECK-LABEL: f8recp:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frecpe
-; CHECK: frecpe
+; CHECK-NEXT: frecpe
+; CHECK-NEXT: frecps
+; CHECK: frecps
 }
 
 define double @drecp(double %x) #0 {
@@ -75,7 +76,7 @@
 ; CHECK-LABEL: drecp:
 ; CHECK-NEXT: BB#0
 ; CHECK-NEXT: frecpe
-; CHECK-NEXT: fmov
+; CHECK-NEXT: frecps
 }
 
 define <2 x double> @d2recp(<2 x double> %x) #0 {
@@ -89,8 +90,8 @@
 
 ; CHECK-LABEL: d2recp:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frecpe
+; CHECK-NEXT: frecps
 }
 
 define <4 x double> @d4recp(<4 x double> %x) #0 {
@@ -105,9 +106,10 @@
 
 ; CHECK-LABEL: d4recp:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frecpe
-; CHECK: frecpe
+; CHECK-NEXT: frecpe
+; CHECK-NEXT: frecps
+; CHECK: frecps
 }
 
 attributes #0 = { nounwind "unsafe-fp-math"="true" }
Index: llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -21,8 +21,9 @@
 
 ; CHECK-LABEL: fsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <2 x float> @f2sqrt(<2 x float> %a) #0 {
@@ -35,9 +36,9 @@
 
 ; CHECK-LABEL: f2sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <4 x float> @f4sqrt(<4 x float> %a) #0 {
@@ -50,9 +51,9 @@
 
 ; CHECK-LABEL: f4sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <8 x float> @f8sqrt(<8 x float> %a) #0 {
@@ -66,10 +67,10 @@
 
 ; CHECK-LABEL: f8sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
 ; CHECK-NEXT: frsqrte
-; CHECK: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrts
 }
 
 define double @dsqrt(double %a) #0 {
@@ -82,8 +83,9 @@
 
 ; CHECK-LABEL: dsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <2 x double> @d2sqrt(<2 x double> %a) #0 {
@@ -96,9 +98,9 @@
 
 ; CHECK-LABEL: d2sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <4 x double> @d4sqrt(<4 x double> %a) #0 {
@@ -112,10 +114,10 @@
 
 ; CHECK-LABEL: d4sqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
-; CHECK-NEXT: mov
 ; CHECK-NEXT: frsqrte
-; CHECK: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrts
 }
 
 define float @frsqrt(float %a) #0 {
@@ -129,8 +131,9 @@
 
 ; CHECK-LABEL: frsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <2 x float> @f2rsqrt(<2 x float> %a) #0 {
@@ -144,8 +147,9 @@
 
 ; CHECK-LABEL: f2rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
@@ -159,8 +163,9 @@
 
 ; CHECK-LABEL: f4rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <8 x float> @f8rsqrt(<8 x float> %a) #0 {
@@ -175,9 +180,10 @@
 
 ; CHECK-LABEL: f8rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frsqrte
-; CHECK: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrts
 }
 
 define double @drsqrt(double %a) #0 {
@@ -191,8 +197,9 @@
 
 ; CHECK-LABEL: drsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <2 x double> @d2rsqrt(<2 x double> %a) #0 {
@@ -206,8 +213,9 @@
 
 ; CHECK-LABEL: d2rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrts
 }
 
 define <4 x double> @d4rsqrt(<4 x double> %a) #0 {
@@ -222,9 +230,10 @@
 
 ; CHECK-LABEL: d4rsqrt:
 ; CHECK-NEXT: BB#0
-; CHECK-NEXT: fmov
 ; CHECK-NEXT: frsqrte
-; CHECK: frsqrte
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsqrte
+; CHECK-NEXT: frsqrts
 }
 
 attributes #0 = { nounwind "unsafe-fp-math"="true" }