Index: llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1181,6 +1181,13 @@
                                 const SrcOp &Op0, const SrcOp &Op1,
                                 std::optional<unsigned> Flags = std::nullopt);
 
+  /// Build and insert a \p Res = G_IS_FPCLASS \p Src, \p Mask
+  MachineInstrBuilder buildIsFPClass(const DstOp &Res, const SrcOp &Src,
+                                     unsigned Mask) {
+    return buildInstr(TargetOpcode::G_IS_FPCLASS, {Res},
+                      {Src, SrcOp(static_cast<int64_t>(Mask))});
+  }
+
   /// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1
   ///
   /// \pre setBasicBlock or setMI must have been called.
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -157,6 +157,9 @@
   bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B) const;
 
+  bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI,
+                     MachineIRBuilder &B) const;
+
   bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  MachineIRBuilder &B) const;
 
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -907,7 +907,12 @@
     .clampScalar(0, S16, S64);
 
   if (ST.has16BitInsts()) {
-    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
+    getActionDefinitionsBuilder(G_FSQRT)
+      .legalFor({S32, S16})
+      .customFor({S64})
+      .scalarize(0)
+      .clampScalar(0, S16, S64);
+    getActionDefinitionsBuilder(G_FFLOOR)
       .legalFor({S32, S64, S16})
       .scalarize(0)
       .clampScalar(0, S16, S64);
@@ -925,7 +930,8 @@
       .lower();
   } else {
     getActionDefinitionsBuilder(G_FSQRT)
-      .legalFor({S32, S64})
+      .legalFor({S32})
+      .customFor({S64})
       .scalarize(0)
       .clampScalar(0, S32, S64);
 
@@ -1996,6 +2002,8 @@
     return legalizeFDIV(MI, MRI, B);
   case TargetOpcode::G_FFREXP:
     return legalizeFFREXP(MI, MRI, B);
+  case TargetOpcode::G_FSQRT:
+    return legalizeFSQRT(MI, MRI, B);
   case TargetOpcode::G_UDIV:
   case TargetOpcode::G_UREM:
   case TargetOpcode::G_UDIVREM:
@@ -4829,6 +4837,90 @@
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
+                                        MachineRegisterInfo &MRI,
+                                        MachineIRBuilder &B) const {
+  // For double type, the SQRT and RSQ instructions don't have the required
+  // precision, so we apply Goldschmidt's algorithm to improve the result:
+  //
+  //   y0 = rsq(x)
+  //   g0 = x * y0
+  //   h0 = 0.5 * y0
+  //
+  //   r0 = 0.5 - h0 * g0
+  //   g1 = g0 * r0 + g0
+  //   h1 = h0 * r0 + h0
+  //
+  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
+  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
+  //   h2 = h1 * r1 + h1
+  //
+  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
+  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
+  //
+  //   sqrt(x) = g3
+
+  const LLT S1 = LLT::scalar(1);
+  const LLT S32 = LLT::scalar(32);
+  const LLT F64 = LLT::scalar(64);
+
+  Register Dst = MI.getOperand(0).getReg();
+  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
+
+  Register X = MI.getOperand(1).getReg();
+  unsigned Flags = MI.getFlags();
+
+  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
+
+  auto ZeroInt = B.buildConstant(S32, 0);
+  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
+
+  // Scale up input if it is too small.
+  auto ScaleUpFactor = B.buildConstant(S32, 256);
+  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
+  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
+
+  auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false)
+                   .addReg(SqrtX.getReg(0));
+
+  auto Half = B.buildFConstant(F64, 0.5);
+  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
+  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
+
+  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
+  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
+
+  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
+  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
+
+  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
+  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
+
+  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
+
+  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
+  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
+
+  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
+
+  // Scale down the result.
+  auto ScaleDownFactor = B.buildConstant(S32, -128);
+  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
+  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
+
+  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
+  // with finite only or nsz because rsq(+/-0) = +/-inf
+
+  // TODO: Check for DAZ and expand to subnormals
+  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
+
+  // If x is +INF, +0, or -0, use its original value
+  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
 // FIXME: Why do we handle this one but not other removed instructions?
 //
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -109,6 +109,7 @@
   SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -219,6 +219,8 @@
   setOperationAction(ISD::SELECT, MVT::f64, Promote);
   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
 
+  setOperationAction(ISD::FSQRT, MVT::f64, Custom);
+
   setOperationAction(ISD::SELECT_CC,
                      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
 
@@ -4806,7 +4808,10 @@
            "Load should return a value and a chain");
     return Result;
   }
-
+  case ISD::FSQRT:
+    if (Op.getValueType() == MVT::f64)
+      return lowerFSQRTF64(Op, DAG);
+    return SDValue();
   case ISD::FSIN:
   case ISD::FCOS:
     return LowerTrig(Op, DAG);
@@ -9631,6 +9636,87 @@
   return SDValue();
 }
 
+SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
+  // For double type, the SQRT and RSQ instructions don't have the required
+  // precision, so we apply Goldschmidt's algorithm to improve the result:
+  //
+  //   y0 = rsq(x)
+  //   g0 = x * y0
+  //   h0 = 0.5 * y0
+  //
+  //   r0 = 0.5 - h0 * g0
+  //   g1 = g0 * r0 + g0
+  //   h1 = h0 * r0 + h0
+  //
+  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
+  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
+  //   h2 = h1 * r1 + h1
+  //
+  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
+  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
+  //
+  //   sqrt(x) = g3
+
+  SDNodeFlags Flags = Op->getFlags();
+
+  SDLoc DL(Op);
+
+  SDValue X = Op.getOperand(0);
+  SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
+
+  SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
+
+  SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
+
+  // Scale up input if it is too small.
+  SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
+  SDValue ScaleUp =
+      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
+  SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
+
+  SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
+
+  SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
+
+  SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
+  SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
+
+  SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
+  SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
+
+  SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
+
+  SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
+
+  SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
+  SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
+
+  SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
+
+  SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
+  SDValue SqrtD1 =
+      DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
+
+  SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
+
+  SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
+  SDValue ScaleDown =
+      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
+  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
+
+  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
+  // with finite only or nsz because rsq(+/-0) = +/-inf
+
+  // TODO: Check for DAZ and expand to subnormals
+  SDValue IsZeroOrInf =
+      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
+                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
+
+  // If x is +INF, +0, or -0, use its original value
+  return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
+                     Flags);
+}
+
 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -332,7 +332,7 @@
 let TRANS = 1, SchedRW = [WriteTrans64] in {
 defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
 defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
-defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>;
+defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
 } // End TRANS = 1, SchedRW = [WriteTrans64]
 
 let TRANS = 1, SchedRW = [WriteTrans32] in {
Index: llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
===================================================================
--- llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
+++ llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
@@ -52,10 +52,10 @@
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'fsqrt'
@@ -63,10 +63,10 @@
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call float @llvm.sqrt.f32(float undef)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
@@ -1,9 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI  %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9  %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9  %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9  %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=SI,GCN  %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefixes=VI,GCN %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN  %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN  %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN  %s
 
 ---
 name: test_fsqrt_s32
@@ -11,24 +11,12 @@
   bb.0:
     liveins: $vgpr0
 
-    ; SI-LABEL: name: test_fsqrt_s32
-    ; SI: liveins: $vgpr0
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
-    ; SI-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
-    ; VI-LABEL: name: test_fsqrt_s32
-    ; VI: liveins: $vgpr0
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
-    ; VI-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
-    ; GFX9-LABEL: name: test_fsqrt_s32
-    ; GFX9: liveins: $vgpr0
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
+    ; GCN-LABEL: name: test_fsqrt_s32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
+    ; GCN-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = G_FSQRT %0
     $vgpr0 = COPY %1
@@ -40,28 +28,82 @@
   bb.0:
     liveins: $vgpr0
 
-    ; SI-LABEL: name: test_fsqrt_s64
-    ; SI: liveins: $vgpr0
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]]
-    ; SI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64)
-    ; VI-LABEL: name: test_fsqrt_s64
-    ; VI: liveins: $vgpr0
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]]
-    ; VI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64)
-    ; GFX9-LABEL: name: test_fsqrt_s64
-    ; GFX9: liveins: $vgpr0
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]]
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64)
+    ; GCN-LABEL: name: test_fsqrt_s64
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000
+    ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]]
+    ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+    ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[COPY]], [[SELECT]](s32)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64)
+    ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01
+    ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]]
+    ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]]
+    ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]]
+    ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]]
+    ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]]
+    ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]]
+    ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]]
+    ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]]
+    ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]]
+    ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]]
+    ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32)
+    ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608
+    ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]]
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_FSQRT %0
     $vgpr0_vgpr1 = COPY %1
 
+...
+
+---
+name: test_fsqrt_s64_ninf
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: test_fsqrt_s64_ninf
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000
+    ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]]
+    ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+    ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[COPY]], [[SELECT]](s32)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64)
+    ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01
+    ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]]
+    ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]]
+    ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]]
+    ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]]
+    ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]]
+    ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]]
+    ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]]
+    ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]]
+    ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]]
+    ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]]
+    ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[FMA6]], [[SELECT1]](s32)
+    ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608
+    ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = ninf G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]]
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = ninf G_FSQRT %0
+    $vgpr0_vgpr1 = COPY %1
+
 ...
 ---
 name: test_fsqrt_s16
@@ -108,33 +150,15 @@
   bb.0:
     liveins: $vgpr0_vgpr1
 
-    ; SI-LABEL: name: test_fsqrt_v2s32
-    ; SI: liveins: $vgpr0_vgpr1
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
-    ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
-    ; VI-LABEL: name: test_fsqrt_v2s32
-    ; VI: liveins: $vgpr0_vgpr1
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
-    ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
-    ; GFX9-LABEL: name: test_fsqrt_v2s32
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ; GCN-LABEL: name: test_fsqrt_v2s32
+    ; GCN: liveins: $vgpr0_vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
+    ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
+    ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = G_FSQRT %0
     $vgpr0_vgpr1 = COPY %1
@@ -146,36 +170,16 @@
   bb.0:
     liveins: $vgpr0_vgpr1_vgpr2
 
-    ; SI-LABEL: name: test_fsqrt_v3s32
-    ; SI: liveins: $vgpr0_vgpr1_vgpr2
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
-    ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; SI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
-    ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
-    ; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
-    ; VI-LABEL: name: test_fsqrt_v3s32
-    ; VI: liveins: $vgpr0_vgpr1_vgpr2
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
-    ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; VI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
-    ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
-    ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX9-LABEL: name: test_fsqrt_v3s32
-    ; GFX9: liveins: $vgpr0_vgpr1_vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; GFX9-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ; GCN-LABEL: name: test_fsqrt_v3s32
+    ; GCN: liveins: $vgpr0_vgpr1_vgpr2
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+    ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+    ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
+    ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
+    ; GCN-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
+    ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
     %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
     %1:_(<3 x  s32>) = G_FSQRT %0
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -187,33 +191,58 @@
   bb.0:
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3
 
-    ; SI-LABEL: name: test_fsqrt_v2s64
-    ; SI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
-    ; SI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]]
-    ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]]
-    ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64)
-    ; SI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
-    ; VI-LABEL: name: test_fsqrt_v2s64
-    ; VI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
-    ; VI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]]
-    ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]]
-    ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64)
-    ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
-    ; GFX9-LABEL: name: test_fsqrt_v2s64
-    ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]]
-    ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GCN-LABEL: name: test_fsqrt_v2s64
+    ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000
+    ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s64), [[C]]
+    ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+    ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[UV]], [[SELECT]](s32)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64)
+    ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01
+    ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]]
+    ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]]
+    ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]]
+    ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]]
+    ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]]
+    ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]]
+    ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]]
+    ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]]
+    ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]]
+    ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]]
+    ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32)
+    ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608
+    ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]]
+    ; GCN-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s64), [[C]]
+    ; GCN-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP2:%[0-9]+]]:_(s64) = G_FLDEXP [[UV1]], [[SELECT3]](s32)
+    ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP2]](s64)
+    ; GCN-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[INT1]], [[C3]]
+    ; GCN-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP2]], [[INT1]]
+    ; GCN-NEXT: [[FNEG3:%[0-9]+]]:_(s64) = G_FNEG [[FMUL2]]
+    ; GCN-NEXT: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FNEG3]], [[FMUL3]], [[C3]]
+    ; GCN-NEXT: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FMUL3]], [[FMA7]], [[FMUL3]]
+    ; GCN-NEXT: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMUL2]], [[FMA7]], [[FMUL2]]
+    ; GCN-NEXT: [[FNEG4:%[0-9]+]]:_(s64) = G_FNEG [[FMA8]]
+    ; GCN-NEXT: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG4]], [[FMA8]], [[FLDEXP2]]
+    ; GCN-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMA8]]
+    ; GCN-NEXT: [[FNEG5:%[0-9]+]]:_(s64) = G_FNEG [[FMA11]]
+    ; GCN-NEXT: [[FMA12:%[0-9]+]]:_(s64) = G_FMA [[FNEG5]], [[FMA11]], [[FLDEXP2]]
+    ; GCN-NEXT: [[FMA13:%[0-9]+]]:_(s64) = G_FMA [[FMA12]], [[FMA9]], [[FMA11]]
+    ; GCN-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP3:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA13]], [[SELECT4]](s32)
+    ; GCN-NEXT: [[IS_FPCLASS1:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP2]](s64), 608
+    ; GCN-NEXT: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS1]](s1), [[FLDEXP2]], [[FLDEXP3]]
+    ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT2]](s64), [[SELECT5]](s64)
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     %1:_(<2 x s64>) = G_FSQRT %0
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
Index: llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,48 +1,248 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
 
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
 
 define double @v_sqrt_f64(double %x) {
-; GCN-LABEL: v_sqrt_f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_fneg(double %x) {
-; GCN-LABEL: v_sqrt_f64_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], -v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fneg:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 9
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fneg:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.neg = fneg double %x
   %result = call double @llvm.sqrt.f64(double %x.neg)
   ret double %result
 }
 
 define double @v_sqrt_f64_fabs(double %x) {
-; GCN-LABEL: v_sqrt_f64_fabs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fabs:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fabs:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %result = call double @llvm.sqrt.f64(double %x.fabs)
   ret double %result
 }
 
 define double @v_sqrt_f64_fneg_fabs(double %x) {
-; GCN-LABEL: v_sqrt_f64_fneg_fabs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fneg_fabs:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 9
+; SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fneg_fabs:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %x.fabs.neg = fneg double %x.fabs
   %result = call double @llvm.sqrt.f64(double %x.fabs.neg)
@@ -50,42 +250,245 @@
 }
 
 define double @v_sqrt_f64_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call ninf double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" {
-; GCN-LABEL: v_sqrt_f64_no_infs_attribute:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_no_infs_attribute:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call ninf double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_nnan(double %x) {
-; GCN-LABEL: v_sqrt_f64_nnan:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nnan:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nnan:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_mov_b32 s2, 0
+; GISEL-NEXT:    s_brev_b32 s3, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GISEL-NEXT:    ; return to shader part epilog
   %result = call double @llvm.sqrt.f64(double %x)
   %cast = bitcast double %result to <2 x i32>
   %cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -98,12 +501,65 @@
 }
 
 define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_mov_b32 s2, 0
+; GISEL-NEXT:    s_brev_b32 s3, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GISEL-NEXT:    ; return to shader part epilog
   %result = call ninf double @llvm.sqrt.f64(double %x)
   %cast = bitcast double %result to <2 x i32>
   %cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -116,12 +572,65 @@
 }
 
 define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64_afn:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64_afn:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64_afn:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_mov_b32 s2, 0
+; GISEL-NEXT:    s_brev_b32 s3, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GISEL-NEXT:    ; return to shader part epilog
   %result = call afn double @llvm.sqrt.f64(double %x)
   %cast = bitcast double %result to <2 x i32>
   %cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -134,12 +643,65 @@
 }
 
 define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64_afn_nnan_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_mov_b32 s2, 0
+; GISEL-NEXT:    s_brev_b32 s3, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GISEL-NEXT:    ; return to shader part epilog
   %result = call afn nnan ninf double @llvm.sqrt.f64(double %x)
   %cast = bitcast double %result to <2 x i32>
   %cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -152,167 +714,1147 @@
 }
 
 define double @v_sqrt_f64_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_nsz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nsz:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_nnan_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_nnan_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nnan_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan ninf double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_nnan_ninf_nsz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan ninf nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_afn(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_afn_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nsz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nsz:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
-; GCN-LABEL: v_sqrt_v2f64_afn:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v2f64_afn:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SDAG-NEXT:    v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v2f64_afn:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %result
 }
 
 define double @v_sqrt_f64_afn_nnan(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nnan:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nnan:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nnan:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_fabs_afn_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs = call double @llvm.fabs.f64(double %x)
   %result = call afn ninf double @llvm.sqrt.f64(double %fabs)
   ret double %result
 }
 
 define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
-; GCN-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SDAG-NEXT:    v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %result
 }
 
 define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
-; GCN-LABEL: v_sqrt_f64__approx_func_fp_math:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
-; GCN-LABEL: v_sqrt_f64__enough_unsafe_attrs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
-; GCN-LABEL: v_sqrt_f64__unsafe_attr:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64__unsafe_attr:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64__unsafe_attr:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
-; GCN-LABEL: v_sqrt_v2f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v2f64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SDAG-NEXT:    v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v2f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %result
 }
 
 define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
-; GCN-LABEL: v_sqrt_v3f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT:    v_sqrt_f64_e32 v[4:5], v[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v3f64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s6, 0
+; SDAG-NEXT:    s_brev_b32 s7, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; SDAG-NEXT:    v_rsq_f64_e32 v[8:9], v[2:3]
+; SDAG-NEXT:    v_rsq_f64_e32 v[10:11], v[4:5]
+; SDAG-NEXT:    v_mul_f64 v[12:13], v[0:1], v[6:7]
+; SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT:    v_mul_f64 v[14:15], v[2:3], v[8:9]
+; SDAG-NEXT:    v_mul_f64 v[8:9], v[8:9], 0.5
+; SDAG-NEXT:    v_mul_f64 v[16:17], v[4:5], v[10:11]
+; SDAG-NEXT:    v_mul_f64 v[10:11], v[10:11], 0.5
+; SDAG-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[12:13], 0.5
+; SDAG-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[14:15], 0.5
+; SDAG-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[16:17], 0.5
+; SDAG-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SDAG-NEXT:    v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[12:13], v[18:19], v[6:7], v[12:13]
+; SDAG-NEXT:    v_fma_f64 v[14:15], v[20:21], v[8:9], v[14:15]
+; SDAG-NEXT:    v_fma_f64 v[16:17], v[22:23], v[10:11], v[16:17]
+; SDAG-NEXT:    v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[18:19], v[6:7], v[12:13]
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v13, 0x260
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[20:21], v[8:9], v[14:15]
+; SDAG-NEXT:    v_cndmask_b32_e32 v14, 0, v12, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, 0, v12, s[4:5]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[22:23], v[10:11], v[16:17]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[6:7]
+; SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v14
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v13
+; SDAG-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v15
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], v13
+; SDAG-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v12
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v3f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s6, 0
+; GISEL-NEXT:    s_brev_b32 s7, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[6:7]
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v7
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; GISEL-NEXT:    v_rsq_f64_e32 v[8:9], v[2:3]
+; GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[4:5]
+; GISEL-NEXT:    v_mul_f64 v[12:13], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[6:7], v[0:1], v[6:7]
+; GISEL-NEXT:    v_mul_f64 v[14:15], v[8:9], 0.5
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[8:9]
+; GISEL-NEXT:    v_mul_f64 v[16:17], v[10:11], 0.5
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[4:5], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5
+; GISEL-NEXT:    v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5
+; GISEL-NEXT:    v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
+; GISEL-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, 0, v12, vcc
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, v12, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[6:7]
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v14
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v13
+; GISEL-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v15
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], v13
+; GISEL-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v12
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x double> @llvm.sqrt.v3f64(<3 x double> %x)
   ret <3 x double> %result
 }
@@ -329,5 +1871,4 @@
 attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
 attributes #4 = { "unsafe-fp-math"="true" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GISEL: {{.*}}
-; SDAG: {{.*}}
+; GCN: {{.*}}
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -3,6 +3,7 @@
 declare float @llvm.amdgcn.rcp.f32(float) #0
 declare double @llvm.amdgcn.rcp.f64(double) #0
 
+declare double @llvm.amdgcn.sqrt.f64(double) #0
 declare double @llvm.sqrt.f64(double) #0
 declare float @llvm.sqrt.f32(float) #0
 
@@ -124,7 +125,15 @@
 
 ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f64:
 ; SI-NOT: v_rsq_f64_e32
-; SI: v_sqrt_f64
+; SI: v_rsq_f64
+; SI: v_mul_f64
+; SI: v_mul_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
 ; SI: v_rcp_f64
 define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
   %sqrt = call double @llvm.sqrt.f64(double %src)
@@ -133,12 +142,42 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}safe_amdgcn_sqrt_rsq_rcp_pat_f64:
+; SI-NOT: v_rsq_f64_e32
+; SI: v_sqrt_f64
+; SI: v_rcp_f64
+define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+  %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
+  %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+  store double %rcp, double addrspace(1)* %out, align 8
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f64:
+; SI: v_rsq_f64
+; SI: v_mul_f64
+; SI: v_mul_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_rcp_f64
+; SI: buffer_store_dwordx2
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+  %sqrt = call double @llvm.sqrt.f64(double %src)
+  %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+  store double %rcp, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_amdgcn_sqrt_rsq_rcp_pat_f64:
 ; SI: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SQRT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
-  %sqrt = call double @llvm.sqrt.f64(double %src)
+define amdgpu_kernel void @unsafe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
+  %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
   %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
   store double %rcp, ptr addrspace(1) %out, align 8
   ret void
Index: llvm/test/CodeGen/AMDGPU/rsq.f64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -15,8 +15,30 @@
 define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; SI-SDAG-LABEL: s_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -37,8 +59,32 @@
 ;
 ; SI-GISEL-LABEL: s_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -59,7 +105,29 @@
 ;
 ; VI-SDAG-LABEL: s_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -77,7 +145,31 @@
 ;
 ; VI-GISEL-LABEL: s_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -107,8 +199,30 @@
 define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; SI-SDAG-LABEL: s_rsq_f64_fabs:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -129,8 +243,32 @@
 ;
 ; SI-GISEL-LABEL: s_rsq_f64_fabs:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -151,7 +289,29 @@
 ;
 ; VI-SDAG-LABEL: s_rsq_f64_fabs:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -169,7 +329,31 @@
 ;
 ; VI-GISEL-LABEL: s_rsq_f64_fabs:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -200,8 +384,30 @@
 define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; SI-SDAG-LABEL: s_neg_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0xbff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -222,8 +428,32 @@
 ;
 ; SI-GISEL-LABEL: s_neg_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -244,7 +474,29 @@
 ;
 ; VI-SDAG-LABEL: s_neg_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -262,7 +514,31 @@
 ;
 ; VI-GISEL-LABEL: s_neg_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -292,8 +568,30 @@
 define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; SI-SDAG-LABEL: s_neg_rsq_neg_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], -s[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 9
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0xbff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -314,8 +612,32 @@
 ;
 ; SI-GISEL-LABEL: s_neg_rsq_neg_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], -s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -336,7 +658,29 @@
 ;
 ; VI-SDAG-LABEL: s_neg_rsq_neg_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], -s[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 9
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -354,7 +698,31 @@
 ;
 ; VI-GISEL-LABEL: s_neg_rsq_neg_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], -s[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -386,8 +754,30 @@
 ; SI-SDAG-LABEL: v_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -407,8 +797,30 @@
 ; SI-GISEL-LABEL: v_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -428,7 +840,29 @@
 ; VI-SDAG-LABEL: v_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -445,7 +879,29 @@
 ; VI-GISEL-LABEL: v_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -467,8 +923,30 @@
 ; SI-SDAG-LABEL: v_rsq_f64_fabs:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -488,8 +966,30 @@
 ; SI-GISEL-LABEL: v_rsq_f64_fabs:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -509,7 +1009,29 @@
 ; VI-SDAG-LABEL: v_rsq_f64_fabs:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -526,7 +1048,29 @@
 ; VI-GISEL-LABEL: v_rsq_f64_fabs:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -549,8 +1093,30 @@
 ; SI-SDAG-LABEL: v_rsq_f64_missing_contract0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -570,8 +1136,30 @@
 ; SI-GISEL-LABEL: v_rsq_f64_missing_contract0:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -591,7 +1179,29 @@
 ; VI-SDAG-LABEL: v_rsq_f64_missing_contract0:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -608,7 +1218,29 @@
 ; VI-GISEL-LABEL: v_rsq_f64_missing_contract0:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -630,8 +1262,30 @@
 ; SI-SDAG-LABEL: v_rsq_f64_missing_contract1:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -651,8 +1305,30 @@
 ; SI-GISEL-LABEL: v_rsq_f64_missing_contract1:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -672,7 +1348,29 @@
 ; VI-SDAG-LABEL: v_rsq_f64_missing_contract1:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -689,7 +1387,29 @@
 ; VI-GISEL-LABEL: v_rsq_f64_missing_contract1:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -711,8 +1431,30 @@
 ; SI-SDAG-LABEL: v_neg_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xbff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -732,8 +1474,30 @@
 ; SI-GISEL-LABEL: v_neg_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], -1.0, v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -753,7 +1517,29 @@
 ; VI-SDAG-LABEL: v_neg_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -770,7 +1556,29 @@
 ; VI-GISEL-LABEL: v_neg_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -792,101 +1600,222 @@
 ; SI-SDAG-LABEL: v_rsq_v2f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
-; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; SI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[12:13], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v7
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13]
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9]
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v13
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-SDAG-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17]
-; SI-SDAG-NEXT:    v_mul_f64 v[12:13], v[18:19], v[10:11]
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v9
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19]
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[18:19], v[6:7]
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v11
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v19
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13]
+; SI-SDAG-NEXT:    s_nop 0
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_v2f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
-; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v13
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v19
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v19
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_nop 0
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_v2f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
-; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; VI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_mul_f64 v[18:19], v[16:17], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13]
-; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT:    v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], 1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[9:10], v[5:6]
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[11:12], v[7:8]
+; VI-SDAG-NEXT:    v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_div_scale_f64 v[13:14], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10]
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12]
+; VI-SDAG-NEXT:    v_mul_f64 v[15:16], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_mul_f64 v[19:20], v[17:18], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16]
 ; VI-SDAG-NEXT:    s_mov_b64 vcc, s[4:5]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -894,9 +1823,48 @@
 ; VI-GISEL-LABEL: v_rsq_v2f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
@@ -929,101 +1897,222 @@
 ; SI-SDAG-LABEL: v_neg_rsq_v2f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xbff00000
-; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; SI-SDAG-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; SI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[12:13], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v7
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13]
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9]
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v13
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-SDAG-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17]
-; SI-SDAG-NEXT:    v_mul_f64 v[12:13], v[18:19], v[10:11]
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v9
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19]
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[18:19], v[6:7]
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v11
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v19
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13]
+; SI-SDAG-NEXT:    s_nop 0
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_rsq_v2f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbff00000
-; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
-; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v13
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v19
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v19
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], -1.0
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_nop 0
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_v2f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
-; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; VI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_mul_f64 v[18:19], v[16:17], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13]
-; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT:    v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], -1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[17:18], s[4:5], -1.0, v[2:3], -1.0
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[9:10], v[5:6]
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[11:12], v[7:8]
+; VI-SDAG-NEXT:    v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_div_scale_f64 v[13:14], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10]
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12]
+; VI-SDAG-NEXT:    v_mul_f64 v[15:16], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_mul_f64 v[19:20], v[17:18], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16]
 ; VI-SDAG-NEXT:    s_mov_b64 vcc, s[4:5]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1031,9 +2120,48 @@
 ; VI-GISEL-LABEL: v_neg_rsq_v2f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
@@ -1066,8 +2194,30 @@
 ; SI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xbff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1089,43 +2239,105 @@
 ; SI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0xbff00000
-; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v13
+; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v16, v13
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[6:7]
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v19
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v19
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], s[4:5]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_nop 0
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5]
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1144,9 +2356,48 @@
 ; VI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
@@ -1179,105 +2430,224 @@
 ; SI-SDAG-LABEL: v_neg_pos_rsq_v2f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xbff00000
-; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; SI-SDAG-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; SI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[12:13], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v7
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13]
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9]
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v13
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-SDAG-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[18:19], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3ff00000
-; SI-SDAG-NEXT:    v_mul_f64 v[12:13], v[18:19], v[10:11]
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17]
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19]
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v9
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v11
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v19
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; SI-SDAG-NEXT:    s_nop 0
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13]
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_pos_rsq_v2f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0xbff00000
-; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v13
+; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v16, v13
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[6:7]
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v19
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0
-; SI-GISEL-NEXT:    s_nop 1
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_nop 0
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_pos_rsq_v2f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
-; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; VI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_mul_f64 v[18:19], v[16:17], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13]
-; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT:    v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[9:10], v[5:6]
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[11:12], v[7:8]
+; VI-SDAG-NEXT:    v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_div_scale_f64 v[13:14], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10]
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12]
+; VI-SDAG-NEXT:    v_mul_f64 v[15:16], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_mul_f64 v[19:20], v[17:18], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16]
 ; VI-SDAG-NEXT:    s_mov_b64 vcc, s[4:5]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1285,9 +2655,48 @@
 ; VI-GISEL-LABEL: v_neg_pos_rsq_v2f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
@@ -1320,8 +2729,30 @@
 ; SI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 9
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1341,8 +2772,30 @@
 ; SI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1362,7 +2815,29 @@
 ; VI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 9
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1379,7 +2854,29 @@
 ; VI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1403,8 +2900,30 @@
 ; SI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1424,8 +2943,30 @@
 ; SI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1445,7 +2986,29 @@
 ; VI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1462,7 +3025,29 @@
 ; VI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1481,226 +3066,1033 @@
 }
 
 define double @v_rsq_f64__afn_fdiv(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_fdiv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_fdiv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn double 1.0, %sqrt
   ret double %rsq
 }
 
 define double @v_rsq_f64__afn(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn double 1.0, %sqrt
   ret double %rsq
 }
 
 define double @v_neg_rsq_f64__afn(double %x) {
-; SDAG-LABEL: v_neg_rsq_f64__afn:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_neg_rsq_f64__afn:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_neg_rsq_f64__afn:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_neg_rsq_f64__afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_neg_rsq_f64__afn:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_neg_rsq_f64__afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn double -1.0, %sqrt
   ret double %rsq
 }
 
 define double @v_rsq_f64__afn_ninf(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_ninf:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_ninf:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_ninf:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn ninf double 1.0, %sqrt
   ret double %rsq
 }
 
 define double @v_rsq_f64__afn_nnan(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_nnan:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_nnan:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_nnan:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_nnan:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_nnan:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_nnan:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn nnan double 1.0, %sqrt
   ret double %rsq
 }
 
 define double @v_rsq_f64__afn_nnan_ninf(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn nnan ninf double 1.0, %sqrt
   ret double %rsq
 }
 
 define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
-; SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn nnan ninf double -1.0, %sqrt
   ret double %rsq
@@ -1710,8 +4102,30 @@
 ; SI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1731,8 +4145,30 @@
 ; SI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1752,7 +4188,29 @@
 ; VI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1769,7 +4227,29 @@
 ; VI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1788,71 +4268,250 @@
 }
 
 define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
-; SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v14, vcc
+; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[8:9], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[0:1], v[8:9]
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[8:9], 0.5
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[6:7], v[2:3]
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_mul_f64 v[8:9], 1.0, v[0:1]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], 1.0, v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v13, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[10:11], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v13
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], 1.0, v[4:5]
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], 1.0, v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[5:6], v[0:1]
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[7:8], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], -v[0:1], v[5:6], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], -v[2:3], v[7:8], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[9:10], v[5:6], v[5:6]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[11:12], v[7:8], v[7:8]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; VI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[6:7], v[2:3]
-; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_mul_f64 v[8:9], 1.0, v[0:1]
-; VI-GISEL-NEXT:    v_mul_f64 v[10:11], 1.0, v[2:3]
-; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9]
-; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], 1.0, v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], 1.0, v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   %rsq = fdiv contract afn nnan ninf <2 x double> <double 1.0, double 1.0>, %sqrt
@@ -1860,34 +4519,155 @@
 }
 
 define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
-; SDAG-LABEL: s_rsq_f64_unsafe:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: s_rsq_f64_unsafe:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], s[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
-; GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
-; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
-; GISEL-NEXT:    ; return to shader part epilog
+; SI-SDAG-LABEL: s_rsq_f64_unsafe:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SI-SDAG-NEXT:    ; return to shader part epilog
+;
+; SI-GISEL-LABEL: s_rsq_f64_unsafe:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; SI-GISEL-NEXT:    ; return to shader part epilog
+;
+; VI-SDAG-LABEL: s_rsq_f64_unsafe:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; VI-SDAG-NEXT:    ; return to shader part epilog
+;
+; VI-GISEL-LABEL: s_rsq_f64_unsafe:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; VI-GISEL-NEXT:    ; return to shader part epilog
   %rsq = call contract double @llvm.sqrt.f64(double %x)
   %result = fdiv contract double 1.0, %rsq
   %cast = bitcast double %result to <2 x i32>
@@ -1901,32 +4681,147 @@
 }
 
 define double @v_rsq_f64_unsafe(double %x) #0 {
-; SDAG-LABEL: v_rsq_f64_unsafe:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64_unsafe:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64_unsafe:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64_unsafe:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64_unsafe:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64_unsafe:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call double @llvm.sqrt.f64(double %x)
   %rsq = fdiv double 1.0, %sqrt
   ret double %rsq
@@ -2190,7 +5085,29 @@
 ; SI-SDAG-LABEL: v_div_contract_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -2210,7 +5127,29 @@
 ; SI-GISEL-LABEL: v_div_contract_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -2230,7 +5169,29 @@
 ; VI-SDAG-LABEL: v_div_contract_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2247,7 +5208,29 @@
 ; VI-GISEL-LABEL: v_div_contract_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2269,7 +5252,29 @@
 ; SI-SDAG-LABEL: v_div_arcp_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -2289,7 +5294,29 @@
 ; SI-GISEL-LABEL: v_div_arcp_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -2309,7 +5336,29 @@
 ; VI-SDAG-LABEL: v_div_arcp_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2326,7 +5375,29 @@
 ; VI-GISEL-LABEL: v_div_arcp_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2348,7 +5419,29 @@
 ; SI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -2368,7 +5461,29 @@
 ; SI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -2388,7 +5503,29 @@
 ; VI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2405,7 +5542,29 @@
 ; VI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2427,9 +5586,30 @@
 ; SI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_brev_b32 s7, 8
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x40700000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7]
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -2449,10 +5629,31 @@
 ; SI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_brev_b32 s7, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0x40700000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x40700000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], s[6:7], v[0:1], s[6:7]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -2472,9 +5673,30 @@
 ; VI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40700000
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5]
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -2491,9 +5713,30 @@
 ; VI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40700000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -2514,3 +5757,5 @@
 attributes #0 = { "unsafe-fp-math"="true" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
+; GISEL: {{.*}}
+; SDAG: {{.*}}