Index: llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1181,6 +1181,13 @@ const SrcOp &Op0, const SrcOp &Op1, std::optional<unsigned> Flags = std::nullopt); + /// Build and insert a \p Res = G_IS_FPCLASS \p Src, \p Mask + MachineInstrBuilder buildIsFPClass(const DstOp &Res, const SrcOp &Src, + unsigned Mask) { + return buildInstr(TargetOpcode::G_IS_FPCLASS, {Res}, + {Src, SrcOp(static_cast<int64_t>(Mask))}); + } + /// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1 /// /// \pre setBasicBlock or setMI must have been called. Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -157,6 +157,9 @@ bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -907,7 +907,12 @@ .clampScalar(0, S16, S64); if (ST.has16BitInsts()) { - getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) + getActionDefinitionsBuilder(G_FSQRT) + .legalFor({S32, S16}) + .customFor({S64}) + .scalarize(0) + .clampScalar(0, S16, S64); + getActionDefinitionsBuilder(G_FFLOOR) .legalFor({S32, S64, S16}) .scalarize(0) .clampScalar(0, S16, S64); @@ -925,7 +930,8 @@ .lower(); } else { getActionDefinitionsBuilder(G_FSQRT) - .legalFor({S32, S64}) + .legalFor({S32}) + .customFor({S64}) 
.scalarize(0) .clampScalar(0, S32, S64); @@ -1996,6 +2002,8 @@ return legalizeFDIV(MI, MRI, B); case TargetOpcode::G_FFREXP: return legalizeFFREXP(MI, MRI, B); + case TargetOpcode::G_FSQRT: + return legalizeFSQRT(MI, MRI, B); case TargetOpcode::G_UDIV: case TargetOpcode::G_UREM: case TargetOpcode::G_UDIVREM: @@ -4829,6 +4837,90 @@ return true; } +bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + // For double type, the SQRT and RSQ instructions don't have required + // precision, we apply Goldschmidt's algorithm to improve the result: + // + // y0 = rsq(x) + // g0 = x * y0 + // h0 = 0.5 * y0 + // + // r0 = 0.5 - h0 * g0 + // g1 = g0 * r0 + g0 + // h1 = h0 * r0 + h0 + // + // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 + // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 + // h2 = h1 * r1 + h1 + // + // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 + // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 + // + // sqrt(x) = g3 + + const LLT S1 = LLT::scalar(1); + const LLT S32 = LLT::scalar(32); + const LLT F64 = LLT::scalar(64); + + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); + + Register X = MI.getOperand(1).getReg(); + unsigned Flags = MI.getFlags(); + + auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); + + auto ZeroInt = B.buildConstant(S32, 0); + auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); + + // Scale up input if it is too small. 
+ auto ScaleUpFactor = B.buildConstant(S32, 256); + auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); + auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); + + auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false) + .addReg(SqrtX.getReg(0)); + + auto Half = B.buildFConstant(F64, 0.5); + auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); + auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); + + auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); + auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); + + auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); + auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); + + auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); + auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); + + auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); + + auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); + auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); + + auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); + + // Scale down the result. + auto ScaleDownFactor = B.buildConstant(S32, -128); + auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); + SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); + + // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check + // with finite only or nsz because rsq(+/-0) = +/-inf + + // TODO: Check for DAZ and expand to subnormals + auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); + + // If x is +INF, +0, or -0, use its original value + B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); + + MI.eraseFromParent(); + return true; +} + // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. // FIXME: Why do we handle this one but not other removed instructions? 
// Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -109,6 +109,7 @@ SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -219,6 +219,8 @@ setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); + setOperationAction(ISD::FSQRT, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); @@ -4806,7 +4808,10 @@ "Load should return a value and a chain"); return Result; } - + case ISD::FSQRT: + if (Op.getValueType() == MVT::f64) + return lowerFSQRTF64(Op, DAG); + return SDValue(); case ISD::FSIN: case ISD::FCOS: return LowerTrig(Op, DAG); @@ -9631,6 +9636,87 @@ return SDValue(); } +SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { + // For double type, the SQRT and RSQ instructions don't have required + // precision, we apply Goldschmidt's algorithm to improve the result: + // + // y0 = rsq(x) + // g0 = x * y0 + // h0 = 0.5 * y0 + // + // r0 = 0.5 - h0 * g0 + // g1 = g0 * r0 + g0 + // h1 = h0 * r0 + h0 + // + // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 + // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 + // h2 = h1 * r1 + h1 + // + // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 + // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 + // + // 
sqrt(x) = g3 + + SDNodeFlags Flags = Op->getFlags(); + + SDLoc DL(Op); + + SDValue X = Op.getOperand(0); + SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64); + + SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT); + + SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32); + + // Scale up input if it is too small. + SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32); + SDValue ScaleUp = + DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt); + SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags); + + SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX); + + SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY); + + SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64); + SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half); + + SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0); + SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half); + + SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0); + + SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0); + + SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1); + SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX); + + SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1); + + SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2); + SDValue SqrtD1 = + DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX); + + SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); + + SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32); + SDValue ScaleDown = + DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt); + SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); + + // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check + // with finite only or nsz because rsq(+/-0) = +/-inf + + // TODO: Check for DAZ and expand to subnormals + SDValue IsZeroOrInf = + DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, + DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); + + // If x is +INF, +0, or -0, use its original value + return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet, + Flags); +} + SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -332,7 +332,7 @@ let TRANS = 1, SchedRW = [WriteTrans64] in { defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>; defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>; -defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>; +defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; } // End TRANS = 1, SchedRW = [WriteTrans64] let TRANS = 1, SchedRW = [WriteTrans32] in { Index: llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll +++ llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll @@ -52,10 +52,10 @@ ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; ALL-SIZE-LABEL: 'fsqrt' @@ -63,10 +63,10 @@ ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %F32 = call float @llvm.sqrt.f32(float undef) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir @@ -1,9 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=SI,GCN %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefixes=VI,GCN %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 
-run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN %s --- name: test_fsqrt_s32 @@ -11,24 +11,12 @@ bb.0: liveins: $vgpr0 - ; SI-LABEL: name: test_fsqrt_s32 - ; SI: liveins: $vgpr0 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]] - ; SI-NEXT: $vgpr0 = COPY [[FSQRT]](s32) - ; VI-LABEL: name: test_fsqrt_s32 - ; VI: liveins: $vgpr0 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]] - ; VI-NEXT: $vgpr0 = COPY [[FSQRT]](s32) - ; GFX9-LABEL: name: test_fsqrt_s32 - ; GFX9: liveins: $vgpr0 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]] - ; GFX9-NEXT: $vgpr0 = COPY [[FSQRT]](s32) + ; GCN-LABEL: name: test_fsqrt_s32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]] + ; GCN-NEXT: $vgpr0 = COPY [[FSQRT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_FSQRT %0 $vgpr0 = COPY %1 @@ -40,28 +28,82 @@ bb.0: liveins: $vgpr0 - ; SI-LABEL: name: test_fsqrt_s64 - ; SI: liveins: $vgpr0 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]] - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64) - ; VI-LABEL: name: test_fsqrt_s64 - ; VI: liveins: $vgpr0 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]] - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64) - ; GFX9-LABEL: name: test_fsqrt_s64 - ; GFX9: liveins: $vgpr0 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]] - ; GFX9-NEXT: 
$vgpr0_vgpr1 = COPY [[FSQRT]](s64) + ; GCN-LABEL: name: test_fsqrt_s64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000 + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]] + ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]] + ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[COPY]], [[SELECT]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64) + ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]] + ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]] + ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]] + ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]] + ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]] + ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]] + ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]] + ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]] + ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]] + ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]] + ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]] + ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]] + ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 + ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]] + ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32) + ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608 + ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]] + ; 
GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_FSQRT %0 $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_fsqrt_s64_ninf +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: test_fsqrt_s64_ninf + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000 + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]] + ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]] + ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[COPY]], [[SELECT]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64) + ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]] + ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]] + ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]] + ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]] + ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]] + ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]] + ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]] + ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]] + ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]] + ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]] + ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]] + ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]] + ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 + ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]] + ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[FMA6]], [[SELECT1]](s32) + ; 
GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608 + ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = ninf G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]] + ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = ninf G_FSQRT %0 + $vgpr0_vgpr1 = COPY %1 + ... --- name: test_fsqrt_s16 @@ -108,33 +150,15 @@ bb.0: liveins: $vgpr0_vgpr1 - ; SI-LABEL: name: test_fsqrt_v2s32 - ; SI: liveins: $vgpr0_vgpr1 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; VI-LABEL: name: test_fsqrt_v2s32 - ; VI: liveins: $vgpr0_vgpr1 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX9-LABEL: name: test_fsqrt_v2s32 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GCN-LABEL: name: 
test_fsqrt_v2s32 + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] + ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32) + ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_FSQRT %0 $vgpr0_vgpr1 = COPY %1 @@ -146,36 +170,16 @@ bb.0: liveins: $vgpr0_vgpr1_vgpr2 - ; SI-LABEL: name: test_fsqrt_v3s32 - ; SI: liveins: $vgpr0_vgpr1_vgpr2 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; SI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]] - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32) - ; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; VI-LABEL: name: test_fsqrt_v3s32 - ; VI: liveins: $vgpr0_vgpr1_vgpr2 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; VI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32) - ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-LABEL: name: test_fsqrt_v3s32 - ; GFX9: liveins: 
$vgpr0_vgpr1_vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; GFX9-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; GCN-LABEL: name: test_fsqrt_v3s32 + ; GCN: liveins: $vgpr0_vgpr1_vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] + ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] + ; GCN-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32) + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = G_FSQRT %0 $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -187,33 +191,58 @@ bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; SI-LABEL: name: test_fsqrt_v2s64 - ; SI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; SI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]] - ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]] - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64) - ; SI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; VI-LABEL: name: 
test_fsqrt_v2s64 - ; VI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; VI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]] - ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64) - ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; GFX9-LABEL: name: test_fsqrt_v2s64 - ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]] - ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GCN-LABEL: name: test_fsqrt_v2s64 + ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000 + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s64), [[C]] + ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]] + ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[UV]], [[SELECT]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64) + ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GCN-NEXT: 
[[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]] + ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]] + ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]] + ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]] + ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]] + ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]] + ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]] + ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]] + ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]] + ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]] + ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]] + ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]] + ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 + ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]] + ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32) + ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608 + ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]] + ; GCN-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s64), [[C]] + ; GCN-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C1]] + ; GCN-NEXT: [[FLDEXP2:%[0-9]+]]:_(s64) = G_FLDEXP [[UV1]], [[SELECT3]](s32) + ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP2]](s64) + ; GCN-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[INT1]], [[C3]] + ; GCN-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP2]], [[INT1]] + ; GCN-NEXT: [[FNEG3:%[0-9]+]]:_(s64) = G_FNEG [[FMUL2]] + ; GCN-NEXT: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FNEG3]], [[FMUL3]], [[C3]] + ; GCN-NEXT: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FMUL3]], [[FMA7]], [[FMUL3]] + ; GCN-NEXT: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMUL2]], [[FMA7]], [[FMUL2]] + ; GCN-NEXT: 
[[FNEG4:%[0-9]+]]:_(s64) = G_FNEG [[FMA8]] + ; GCN-NEXT: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG4]], [[FMA8]], [[FLDEXP2]] + ; GCN-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMA8]] + ; GCN-NEXT: [[FNEG5:%[0-9]+]]:_(s64) = G_FNEG [[FMA11]] + ; GCN-NEXT: [[FMA12:%[0-9]+]]:_(s64) = G_FMA [[FNEG5]], [[FMA11]], [[FLDEXP2]] + ; GCN-NEXT: [[FMA13:%[0-9]+]]:_(s64) = G_FMA [[FMA12]], [[FMA9]], [[FMA11]] + ; GCN-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C1]] + ; GCN-NEXT: [[FLDEXP3:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA13]], [[SELECT4]](s32) + ; GCN-NEXT: [[IS_FPCLASS1:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP2]](s64), 608 + ; GCN-NEXT: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS1]](s1), [[FLDEXP2]], [[FLDEXP3]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT2]](s64), [[SELECT5]](s64) + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<2 x s64>) = G_FSQRT %0 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 Index: llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -1,48 +1,248 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s ; RUN: llc -global-isel=1 -march=amdgcn 
-mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s define double @v_sqrt_f64(double %x) { -; GCN-LABEL: v_sqrt_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] 
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_fneg(double %x) { -; GCN-LABEL: v_sqrt_f64_fneg: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e64 v[0:1], -v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_fneg: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 9 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; 
SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_fneg: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %x.neg = fneg double %x %result = call double @llvm.sqrt.f64(double %x.neg) ret double %result } define double @v_sqrt_f64_fabs(double %x) { -; GCN-LABEL: v_sqrt_f64_fabs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_fabs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: 
s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_fabs: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 
v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %x.fabs = call double @llvm.fabs.f64(double %x) %result = call double @llvm.sqrt.f64(double %x.fabs) ret double %result } define double @v_sqrt_f64_fneg_fabs(double %x) { -; GCN-LABEL: v_sqrt_f64_fneg_fabs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_fneg_fabs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 9 +; SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_fneg_fabs: +; GISEL: ; %bb.0: +; GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %x.fabs = call double @llvm.fabs.f64(double %x) %x.fabs.neg = fneg double %x.fabs %result = call double @llvm.sqrt.f64(double %x.fabs.neg) @@ -50,42 +250,245 @@ } define double @v_sqrt_f64_ninf(double %x) { -; GCN-LABEL: v_sqrt_f64_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], 
v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, 
v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call ninf double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" { -; GCN-LABEL: v_sqrt_f64_no_infs_attribute: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_no_infs_attribute: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_no_infs_attribute: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], 
v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call ninf double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_nnan(double %x) { -; GCN-LABEL: v_sqrt_f64_nnan: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_nnan: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], 
v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_nnan: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nnan double @llvm.sqrt.f64(double %x) ret double %result } define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) { -; GCN-LABEL: s_sqrt_f64: -; GCN: ; %bb.0: -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: ; return to shader part 
epilog +; SDAG-LABEL: s_sqrt_f64: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_sqrt_f64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, 0 +; GISEL-NEXT: s_brev_b32 s3, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] 
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-NEXT: ; return to shader part epilog %result = call double @llvm.sqrt.f64(double %x) %cast = bitcast double %result to <2 x i32> %cast.0 = extractelement <2 x i32> %cast, i32 0 @@ -98,12 +501,65 @@ } define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) { -; GCN-LABEL: s_sqrt_f64_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: ; return to shader part epilog +; SDAG-LABEL: s_sqrt_f64_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: 
v_mov_b32_e32 v4, 0x260 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_sqrt_f64_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, 0 +; GISEL-NEXT: s_brev_b32 s3, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-NEXT: ; return to shader part epilog %result = call ninf double @llvm.sqrt.f64(double %x) %cast = bitcast double %result to <2 x i32> %cast.0 = extractelement <2 x i32> %cast, i32 0 @@ -116,12 +572,65 @@ } define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) { -; GCN-LABEL: s_sqrt_f64_afn: -; GCN: ; %bb.0: -; GCN-NEXT: 
v_sqrt_f64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: ; return to shader part epilog +; SDAG-LABEL: s_sqrt_f64_afn: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_sqrt_f64_afn: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, 0 +; GISEL-NEXT: s_brev_b32 s3, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: 
v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-NEXT: ; return to shader part epilog %result = call afn double @llvm.sqrt.f64(double %x) %cast = bitcast double %result to <2 x i32> %cast.0 = extractelement <2 x i32> %cast, i32 0 @@ -134,12 +643,65 @@ } define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) { -; GCN-LABEL: s_sqrt_f64_afn_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: ; return to shader part epilog +; SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; 
SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, 0 +; GISEL-NEXT: s_brev_b32 s3, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-NEXT: ; return to shader part epilog %result = call afn nnan ninf double @llvm.sqrt.f64(double %x) %cast = bitcast double %result to <2 x i32> 
%cast.0 = extractelement <2 x i32> %cast, i32 0 @@ -152,167 +714,1147 @@ } define double @v_sqrt_f64_nsz(double %x) { -; GCN-LABEL: v_sqrt_f64_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_nsz: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_nsz: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], 
v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_nnan_ninf(double %x) { -; GCN-LABEL: v_sqrt_f64_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; 
SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nnan ninf double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_nnan_ninf_nsz(double %x) { -; GCN-LABEL: v_sqrt_f64_nnan_ninf_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], 
v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nnan ninf nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_afn(double %x) { -; GCN-LABEL: v_sqrt_f64_afn: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_afn_nsz(double %x) { -; GCN-LABEL: v_sqrt_f64_afn_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn_nsz: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; 
SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn_nsz: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nsz double @llvm.sqrt.f64(double %x) ret 
double %result } define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { -; GCN-LABEL: v_sqrt_v2f64_afn: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_v2f64_afn: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5] +; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7] +; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], 
v[10:11] +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_v2f64_afn: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 
v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) ret <2 x double> %result } define double @v_sqrt_f64_afn_nnan(double %x) { -; GCN-LABEL: v_sqrt_f64_afn_nnan: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn_nnan: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 
0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn_nnan: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nnan double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_fabs_afn_ninf(double %x) { -; GCN-LABEL: v_sqrt_f64_fabs_afn_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: 
v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %fabs = call double @llvm.fabs.f64(double %x) %result = call afn ninf double @llvm.sqrt.f64(double %fabs) ret double %result } define double @v_sqrt_f64_afn_nnan_ninf(double %x) { -; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 
s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nnan ninf double @llvm.sqrt.f64(double %x) ret double %result } define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { -; GCN-LABEL: v_sqrt_v2f64_afn_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_v2f64_afn_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: 
v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5] +; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7] +; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, 
v5, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) ret <2 x double> %result } define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) { -; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: 
v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nnan ninf nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 { -; GCN-LABEL: v_sqrt_f64__approx_func_fp_math: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], 
v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 { -; 
GCN-LABEL: v_sqrt_f64__enough_unsafe_attrs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; 
GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64__unsafe_attr(double %x) #4 { -; GCN-LABEL: v_sqrt_f64__unsafe_attr: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64__unsafe_attr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 
v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64__unsafe_attr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nsz double @llvm.sqrt.f64(double %x) ret double %result } define <2 x double> @v_sqrt_v2f64(<2 x double> %x) { -; GCN-LABEL: v_sqrt_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_v2f64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 
s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5] +; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7] +; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; 
SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_v2f64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; GISEL-NEXT: 
v_ldexp_f64 v[4:5], v[4:5], v10 +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) ret <2 x double> %result } define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { -; GCN-LABEL: v_sqrt_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; GCN-NEXT: v_sqrt_f64_e32 v[4:5], v[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_v3f64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s6, 0 +; SDAG-NEXT: s_brev_b32 s7, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] +; SDAG-NEXT: v_rsq_f64_e32 v[10:11], v[4:5] +; SDAG-NEXT: v_mul_f64 v[12:13], v[0:1], v[6:7] +; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9] +; SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 +; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[10:11] +; SDAG-NEXT: v_mul_f64 v[10:11], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[12:13], 0.5 +; SDAG-NEXT: v_fma_f64 v[20:21], -v[8:9], v[14:15], 0.5 +; SDAG-NEXT: v_fma_f64 
v[22:23], -v[10:11], v[16:17], 0.5 +; SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15] +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9] +; SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1] +; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3] +; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5] +; SDAG-NEXT: v_fma_f64 v[12:13], v[18:19], v[6:7], v[12:13] +; SDAG-NEXT: v_fma_f64 v[14:15], v[20:21], v[8:9], v[14:15] +; SDAG-NEXT: v_fma_f64 v[16:17], v[22:23], v[10:11], v[16:17] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1] +; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3] +; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[12:13] +; SDAG-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v13, 0x260 +; SDAG-NEXT: v_fma_f64 v[8:9], v[20:21], v[8:9], v[14:15] +; SDAG-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5] +; SDAG-NEXT: v_fma_f64 v[10:11], v[22:23], v[10:11], v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7] +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13 +; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15 +; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13 +; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; SDAG-NEXT: 
s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_v3f64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s6, 0 +; GISEL-NEXT: s_brev_b32 s7, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[6:7] +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v7 +; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] +; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[4:5] +; GISEL-NEXT: v_mul_f64 v[12:13], v[6:7], 0.5 +; GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] +; GISEL-NEXT: v_mul_f64 v[14:15], v[8:9], 0.5 +; GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[8:9] +; GISEL-NEXT: v_mul_f64 v[16:17], v[10:11], 0.5 +; GISEL-NEXT: v_mul_f64 v[10:11], v[4:5], v[10:11] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5 +; GISEL-NEXT: v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5 +; GISEL-NEXT: v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5 +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9] +; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11] +; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1] +; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3] +; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7] +; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9] +; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], 
v[16:17], v[10:11] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1] +; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3] +; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7] +; GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc +; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7] +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14 +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13 +; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15 +; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13 +; GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <3 x double> @llvm.sqrt.v3f64(<3 x double> %x) ret <3 x double> %result } @@ -329,5 +1871,4 @@ attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" } attributes #4 = { "unsafe-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GISEL: {{.*}} -; SDAG: {{.*}} +; GCN: {{.*}} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -3,6 +3,7 @@ declare float @llvm.amdgcn.rcp.f32(float) #0 declare double @llvm.amdgcn.rcp.f64(double) #0 +declare double @llvm.amdgcn.sqrt.f64(double) #0 declare double @llvm.sqrt.f64(double) #0 declare float @llvm.sqrt.f32(float) #0 @@ -124,7 +125,15 @@ ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f64: ; SI-NOT: v_rsq_f64_e32 -; SI: v_sqrt_f64 +; SI: v_rsq_f64 +; SI: v_mul_f64 +; SI: v_mul_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 ; SI: v_rcp_f64 define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 { %sqrt = call double @llvm.sqrt.f64(double %src) @@ -133,12 +142,42 @@ ret void } +; FUNC-LABEL: {{^}}safe_amdgcn_sqrt_rsq_rcp_pat_f64: +; SI-NOT: v_rsq_f64_e32 +; SI: v_sqrt_f64 +; SI: v_rcp_f64 +define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { + %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src) + %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f64: +; SI: v_rsq_f64 +; SI: v_mul_f64 +; SI: v_mul_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_rcp_f64 +; SI: buffer_store_dwordx2 +define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { + %sqrt = call double @llvm.sqrt.f64(double %src) + %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}unsafe_amdgcn_sqrt_rsq_rcp_pat_f64: ; SI: v_sqrt_f64_e32 
[[SQRT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SQRT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 { - %sqrt = call double @llvm.sqrt.f64(double %src) +define amdgpu_kernel void @unsafe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 { + %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src) %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) store double %rcp, ptr addrspace(1) %out, align 8 ret void Index: llvm/test/CodeGen/AMDGPU/rsq.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -15,8 +15,30 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) { ; SI-SDAG-LABEL: s_rsq_f64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 ; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: 
v_ldexp_f64 v[2:3], v[2:3], s0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -37,8 +59,32 @@ ; ; SI-GISEL-LABEL: s_rsq_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-GISEL-NEXT: s_mov_b32 s2, 0 +; SI-GISEL-NEXT: s_brev_b32 s3, 8 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -59,7 +105,29 @@ ; ; VI-SDAG-LABEL: s_rsq_f64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], 
s[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -77,7 +145,31 @@ ; ; VI-GISEL-LABEL: s_rsq_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; VI-GISEL-NEXT: s_mov_b32 s2, 0 +; VI-GISEL-NEXT: s_brev_b32 s3, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 
0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -107,8 +199,30 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) { ; SI-SDAG-LABEL: s_rsq_f64_fabs: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]| +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 +; SI-SDAG-NEXT: s_and_b64 s[0:1], s[2:3], exec +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 ; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], 
v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -129,8 +243,32 @@ ; ; SI-GISEL-LABEL: s_rsq_f64_fabs: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]| +; SI-GISEL-NEXT: s_mov_b32 s2, 0 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; SI-GISEL-NEXT: s_brev_b32 s3, 8 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 
1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -151,7 +289,29 @@ ; ; VI-SDAG-LABEL: s_rsq_f64_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]| +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], s[2:3], exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -169,7 +329,31 @@ ; ; VI-GISEL-LABEL: s_rsq_f64_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]| +; VI-GISEL-NEXT: s_mov_b32 s2, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_brev_b32 s3, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 +; 
VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -200,8 +384,30 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) { ; SI-SDAG-LABEL: s_neg_rsq_f64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 ; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], 
v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -222,8 +428,32 @@ ; ; SI-GISEL-LABEL: s_neg_rsq_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-GISEL-NEXT: s_mov_b32 s2, 0 +; SI-GISEL-NEXT: s_brev_b32 s3, 8 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; 
SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -244,7 +474,29 @@ ; ; VI-SDAG-LABEL: s_neg_rsq_f64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -262,7 +514,31 @@ ; ; VI-GISEL-LABEL: s_neg_rsq_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; VI-GISEL-NEXT: s_mov_b32 s2, 0 +; VI-GISEL-NEXT: s_brev_b32 s3, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, 
s[0:1], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -292,8 +568,30 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) { ; SI-SDAG-LABEL: s_neg_rsq_neg_f64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 ; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000 +; SI-SDAG-NEXT: v_mul_f64 
v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -314,8 +612,32 @@ ; ; SI-GISEL-LABEL: s_neg_rsq_neg_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1] +; SI-GISEL-NEXT: s_mov_b32 s2, 0 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; SI-GISEL-NEXT: s_brev_b32 s3, 8 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, 
v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -336,7 +658,29 @@ ; ; VI-SDAG-LABEL: s_neg_rsq_neg_f64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -354,7 +698,31 @@ ; ; VI-GISEL-LABEL: s_neg_rsq_neg_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1] +; 
VI-GISEL-NEXT: s_mov_b32 s2, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_brev_b32 s3, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -386,8 +754,30 @@ ; SI-SDAG-LABEL: v_rsq_f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; 
SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -407,8 +797,30 @@ ; SI-GISEL-LABEL: v_rsq_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 
v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -428,7 +840,29 @@ ; VI-SDAG-LABEL: v_rsq_f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 
v[6:7], -v[2:3], v[4:5], 1.0 @@ -445,7 +879,29 @@ ; VI-GISEL-LABEL: v_rsq_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -467,8 +923,30 @@ ; SI-SDAG-LABEL: v_rsq_f64_fabs: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; 
SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -488,8 +966,30 @@ ; SI-GISEL-LABEL: v_rsq_f64_fabs: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] 
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -509,7 +1009,29 @@ ; VI-SDAG-LABEL: v_rsq_f64_fabs: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 
v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -526,7 +1048,29 @@ ; VI-GISEL-LABEL: v_rsq_f64_fabs: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -549,8 +1093,30 @@ ; SI-SDAG-LABEL: v_rsq_f64_missing_contract0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: 
v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -570,8 +1136,30 @@ ; SI-GISEL-LABEL: v_rsq_f64_missing_contract0: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], 
v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -591,7 +1179,29 @@ ; VI-SDAG-LABEL: v_rsq_f64_missing_contract0: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, 
v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -608,7 +1218,29 @@ ; VI-GISEL-LABEL: v_rsq_f64_missing_contract0: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -630,8 +1262,30 @@ ; SI-SDAG-LABEL: v_rsq_f64_missing_contract1: ; SI-SDAG: ; 
%bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -651,8 +1305,30 @@ ; SI-GISEL-LABEL: v_rsq_f64_missing_contract1: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: 
v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -672,7 +1348,29 @@ ; VI-SDAG-LABEL: v_rsq_f64_missing_contract1: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], 
v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -689,7 +1387,29 @@ ; VI-GISEL-LABEL: v_rsq_f64_missing_contract1: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], 
v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -711,8 +1431,30 @@ ; SI-SDAG-LABEL: v_neg_rsq_f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -732,8 +1474,30 @@ ; SI-GISEL-LABEL: v_neg_rsq_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], -1.0, v[0:1], -1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -753,7 +1517,29 @@ ; VI-SDAG-LABEL: v_neg_rsq_f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; 
VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -770,7 +1556,29 @@ ; VI-GISEL-LABEL: v_neg_rsq_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; 
VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -792,101 +1600,222 @@ ; SI-SDAG-LABEL: v_rsq_v2f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 -; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 -; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 
v[12:13], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] +; SI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v15 +; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] +; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0 ; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11] +; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 +; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9] ; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], 
v[10:11], v[6:7] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 ; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17] -; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[18:19], v[10:11] -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19] +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11 ; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19 ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 -; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13] +; SI-SDAG-NEXT: s_nop 0 +; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_rsq_v2f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 
0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000 -; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-GISEL-NEXT: 
v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] +; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] ; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v13 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] ; SI-GISEL-NEXT: s_xor_b64 
vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11] -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19 +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] ; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11] -; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; SI-GISEL-NEXT: s_nop 0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_rsq_v2f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; VI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; VI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; VI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] +; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 ; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; VI-SDAG-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] ; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; VI-SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; VI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; VI-SDAG-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: 
v_cndmask_b32_e32 v10, 0, v8, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[11:12], v[7:8] +; VI-SDAG-NEXT: v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10] +; VI-SDAG-NEXT: v_div_scale_f64 v[13:14], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12] +; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10] +; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12] +; VI-SDAG-NEXT: v_mul_f64 v[15:16], v[13:14], v[9:10] +; VI-SDAG-NEXT: v_mul_f64 v[19:20], v[17:18], v[11:12] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18] +; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16] ; VI-SDAG-NEXT: s_mov_b64 vcc, s[4:5] -; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20] ; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; VI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ 
-894,9 +1823,48 @@ ; VI-GISEL-LABEL: v_rsq_v2f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; VI-GISEL-NEXT: 
v_cndmask_b32_e32 v10, 0, v8, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] @@ -929,101 +1897,222 @@ ; SI-SDAG-LABEL: v_neg_rsq_v2f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 -; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 -; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0 -; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; 
SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] +; SI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v15 +; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] +; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0 ; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11] 
+; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 +; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9] ; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 -; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 ; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17] -; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[18:19], v[10:11] -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19] +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11 ; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19 ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 -; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13] +; SI-SDAG-NEXT: s_nop 0 +; SI-SDAG-NEXT: v_div_fmas_f64 
v[6:7], v[12:13], v[6:7], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_neg_rsq_v2f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 -; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, 
v1, vcc +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0 -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] +; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] ; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v13 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13] -; 
SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11] -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19 +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] ; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11] -; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], -1.0 +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; SI-GISEL-NEXT: s_nop 0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_neg_rsq_v2f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 -; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0 -; VI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0 -; VI-SDAG-NEXT: v_rcp_f64_e32 
v[8:9], v[4:5] -; VI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] +; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 ; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; VI-SDAG-NEXT: v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] ; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; VI-SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; VI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; VI-SDAG-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 
v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], -1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], -1.0, v[2:3], -1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[11:12], v[7:8] +; VI-SDAG-NEXT: v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10] +; VI-SDAG-NEXT: v_div_scale_f64 v[13:14], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12] +; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10] +; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12] +; VI-SDAG-NEXT: v_mul_f64 v[15:16], v[13:14], v[9:10] +; VI-SDAG-NEXT: v_mul_f64 v[19:20], v[17:18], v[11:12] +; 
VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18] +; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16] ; VI-SDAG-NEXT: s_mov_b64 vcc, s[4:5] -; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20] ; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; VI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1031,9 +2120,48 @@ ; VI-GISEL-LABEL: v_neg_rsq_v2f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: 
v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0 ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] @@ -1066,8 +2194,30 @@ ; SI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 
v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -1089,43 +2239,105 @@ ; SI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v16, 0xbff00000 -; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] 
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] +; SI-GISEL-NEXT: v_mul_f64 
v[14:15], v[12:13], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v13 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v16, v13 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11] -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] ; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11] -; 
SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; SI-GISEL-NEXT: s_nop 0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5] ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1144,9 +2356,48 @@ ; VI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], 
v[0:1] -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-GISEL-NEXT: v_cmp_class_f64_e64 
s[4:5], v[2:3], v9 +; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5] ; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] @@ -1179,105 +2430,224 @@ ; SI-SDAG-LABEL: v_neg_pos_rsq_v2f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 -; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 -; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 -; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] 
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] +; SI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v15 +; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] +; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0 ; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11] +; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 +; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13] +; 
SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9] ; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 ; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7] ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3ff00000 -; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[18:19], v[10:11] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17] -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19] -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11 ; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v19 ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-SDAG-NEXT: s_nop 0 -; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13] +; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] 
; ; SI-GISEL-LABEL: v_neg_pos_rsq_v2f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v16, 0xbff00000 -; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], 
v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] +; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v13 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v16, v13 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], 
-v[8:9], v[10:11], 1.0 -; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11] -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] ; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v8, v19 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0 -; SI-GISEL-NEXT: s_nop 1 -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11] -; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; SI-GISEL-NEXT: s_nop 0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_neg_pos_rsq_v2f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 -; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; VI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; VI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; VI-SDAG-NEXT: 
v_rcp_f64_e32 v[10:11], v[6:7] -; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] +; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 ; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; VI-SDAG-NEXT: v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] ; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; VI-SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; VI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; VI-SDAG-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], 
v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[11:12], v[7:8] +; VI-SDAG-NEXT: v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10] +; VI-SDAG-NEXT: v_div_scale_f64 v[13:14], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12] +; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10] +; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12] +; VI-SDAG-NEXT: v_mul_f64 v[15:16], v[13:14], v[9:10] +; VI-SDAG-NEXT: v_mul_f64 v[19:20], v[17:18], v[11:12] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], 
-v[5:6], v[15:16], v[13:14] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18] +; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16] ; VI-SDAG-NEXT: s_mov_b64 vcc, s[4:5] -; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20] ; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; VI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1285,9 +2655,48 @@ ; VI-GISEL-LABEL: v_neg_pos_rsq_v2f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], 
v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] @@ -1320,8 +2729,30 @@ ; SI-SDAG-LABEL: v_rsq_f64_fneg_fabs: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 9 +; SI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; 
SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -1341,8 +2772,30 @@ ; SI-GISEL-LABEL: v_rsq_f64_fneg_fabs: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: 
v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -1362,7 +2815,29 @@ ; VI-SDAG-LABEL: v_rsq_f64_fneg_fabs: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 9 +; VI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1379,7 +2854,29 @@ ; VI-GISEL-LABEL: v_rsq_f64_fneg_fabs: ; 
VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1403,8 +2900,30 @@ ; SI-SDAG-LABEL: v_rsq_f64__afn_sqrt: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 
v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -1424,8 +2943,30 @@ ; SI-GISEL-LABEL: v_rsq_f64__afn_sqrt: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: 
v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -1445,7 +2986,29 @@ ; VI-SDAG-LABEL: v_rsq_f64__afn_sqrt: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 
1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1462,7 +3025,29 @@ ; VI-GISEL-LABEL: v_rsq_f64__afn_sqrt: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1481,226 +3066,1033 @@ } define double @v_rsq_f64__afn_fdiv(double %x) { -; SDAG-LABEL: v_rsq_f64__afn_fdiv: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], 
v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: v_rsq_f64__afn_fdiv: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] -; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_rsq_f64__afn_fdiv: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; 
SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64__afn_fdiv: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: 
v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64__afn_fdiv: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], 
v[0:1], v[2:3], v[2:3] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64__afn_fdiv: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract double @llvm.sqrt.f64(double %x) %rsq = fdiv contract afn double 1.0, %sqrt ret double %rsq } define double @v_rsq_f64__afn(double %x) { -; 
SDAG-LABEL: v_rsq_f64__afn: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: v_rsq_f64__afn: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] -; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_rsq_f64__afn: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; 
SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64__afn: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; 
SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64__afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], 
v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64__afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], 
v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn double @llvm.sqrt.f64(double %x) %rsq = fdiv contract afn double 1.0, %sqrt ret double %rsq } define double @v_neg_rsq_f64__afn(double %x) { -; SDAG-LABEL: v_neg_rsq_f64__afn: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: v_neg_rsq_f64__afn: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[0:1] -; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_neg_rsq_f64__afn: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], 
v[0:1] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_neg_rsq_f64__afn: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: 
v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_neg_rsq_f64__afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_neg_rsq_f64__afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 
v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn double @llvm.sqrt.f64(double %x) %rsq = fdiv contract afn double -1.0, %sqrt ret double %rsq } define double @v_rsq_f64__afn_ninf(double %x) { -; SDAG-LABEL: v_rsq_f64__afn_ninf: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: v_rsq_f64__afn_ninf: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] -; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_rsq_f64__afn_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64__afn_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: 
v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64__afn_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], 
v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64__afn_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], 
v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn ninf double @llvm.sqrt.f64(double %x) %rsq = fdiv contract afn ninf double 1.0, %sqrt ret double %rsq } define double @v_rsq_f64__afn_nnan(double %x) { -; SDAG-LABEL: v_rsq_f64__afn_nnan: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: v_rsq_f64__afn_nnan: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], 
v[0:1] -; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] -; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_rsq_f64__afn_nnan: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64__afn_nnan: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64__afn_nnan: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 
v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64__afn_nnan: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 
v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn nnan double @llvm.sqrt.f64(double %x) %rsq = fdiv contract afn nnan double 1.0, %sqrt ret double %rsq } define double @v_rsq_f64__afn_nnan_ninf(double %x) { -; SDAG-LABEL: v_rsq_f64__afn_nnan_ninf: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: v_rsq_f64__afn_nnan_ninf: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] -; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], 
v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: s_setpc_b64 
s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 
0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x) %rsq = fdiv contract afn nnan ninf double 1.0, %sqrt ret double %rsq } define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { -; SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], 
v[2:3] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[0:1] -; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, 
vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; 
SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: 
v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; 
VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x) %rsq = fdiv contract afn nnan ninf double -1.0, %sqrt ret double %rsq @@ -1710,8 +4102,30 @@ ; SI-SDAG-LABEL: v_rsq_f64__nnan_ninf: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -1731,8 +4145,30 @@ ; SI-GISEL-LABEL: v_rsq_f64__nnan_ninf: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: 
v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -1752,7 +4188,29 @@ ; VI-SDAG-LABEL: v_rsq_f64__nnan_ninf: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], 
v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1769,7 +4227,29 @@ ; VI-GISEL-LABEL: v_rsq_f64__nnan_ninf: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: 
v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1788,71 +4268,250 @@ } define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { -; SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 -; SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 -; SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] -; SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 -; SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 -; SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7] -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: 
v_cndmask_b32_e32 v12, 0, v14, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[0:1], v[8:9] +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 
v[8:9], -v[0:1], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[4:5], v[0:1] -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[6:7], v[2:3] -; SI-GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[2:3] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1] -; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1] -; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3] -; SI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[0:1] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[2:3] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SI-GISEL-NEXT: 
v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v10, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[2:3] +; 
SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] +; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], 
v[10:11], v[14:15], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[5:6], v[0:1] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[7:8], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[9:10], -v[0:1], v[5:6], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[11:12], -v[2:3], v[7:8], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[9:10], v[5:6], v[5:6] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[11:12], v[7:8], v[7:8] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7] +; VI-SDAG-NEXT: 
s_setpc_b64 s[30:31] +; ; VI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[4:5], v[0:1] -; VI-GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[6:7], v[2:3] -; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[2:3] -; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0 -; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0 -; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1] -; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3] -; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0 -; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0 -; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1] -; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3] -; VI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[0:1] -; VI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[2:3] -; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0 -; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0 -; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9] -; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; 
VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5] +; VI-GISEL-NEXT: 
v_mul_f64 v[10:11], 1.0, v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) %rsq = fdiv contract afn nnan ninf <2 x double> , %sqrt @@ -1860,34 +4519,155 @@ } define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { -; SDAG-LABEL: s_rsq_f64_unsafe: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] -; SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; SDAG-NEXT: v_readfirstlane_b32 s1, v1 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: s_rsq_f64_unsafe: -; GISEL: ; %bb.0: -; GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[2:3], s[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] -; GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GISEL-NEXT: v_readfirstlane_b32 s1, v1 -; GISEL-NEXT: ; return to shader part epilog +; SI-SDAG-LABEL: s_rsq_f64_unsafe: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 
0x260 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SI-SDAG-NEXT: ; return to shader part epilog +; +; SI-GISEL-LABEL: s_rsq_f64_unsafe: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_mov_b32 s2, 0 +; SI-GISEL-NEXT: s_brev_b32 s3, 8 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: 
v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; SI-GISEL-NEXT: ; return to shader part epilog +; +; VI-SDAG-LABEL: s_rsq_f64_unsafe: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; 
VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; VI-SDAG-NEXT: ; return to shader part epilog +; +; VI-GISEL-LABEL: s_rsq_f64_unsafe: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_mov_b32 s2, 0 +; VI-GISEL-NEXT: s_brev_b32 s3, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], 
v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; VI-GISEL-NEXT: ; return to shader part epilog %rsq = call contract double @llvm.sqrt.f64(double %x) %result = fdiv contract double 1.0, %rsq %cast = bitcast double %result to <2 x i32> @@ -1901,32 +4681,147 @@ } define double @v_rsq_f64_unsafe(double %x) #0 { -; SDAG-LABEL: v_rsq_f64_unsafe: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 -; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: v_rsq_f64_unsafe: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] -; 
GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] -; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] -; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 -; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_rsq_f64_unsafe: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; 
SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64_unsafe: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64_unsafe: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64_unsafe: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], 
v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call double @llvm.sqrt.f64(double %x) %rsq = fdiv double 1.0, %sqrt ret double %rsq @@ -2190,7 +5085,29 @@ ; SI-SDAG-LABEL: v_div_contract_sqrt_f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; 
SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -2210,7 +5127,29 @@ ; SI-GISEL-LABEL: v_div_contract_sqrt_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], 
v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x260 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -2230,7 +5169,29 @@ ; VI-SDAG-LABEL: v_div_contract_sqrt_f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7 +; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; VI-SDAG-NEXT: 
v_rcp_f64_e32 v[6:7], v[4:5] ; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -2247,7 +5208,29 @@ ; VI-GISEL-LABEL: v_div_contract_sqrt_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7 +; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -2269,7 +5252,29 @@ ; SI-SDAG-LABEL: v_div_arcp_sqrt_f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80 +; SI-SDAG-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -2289,7 +5294,29 @@ ; SI-GISEL-LABEL: v_div_arcp_sqrt_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], 
v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x260 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -2309,7 +5336,29 @@ ; VI-SDAG-LABEL: v_div_arcp_sqrt_f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7 +; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; VI-SDAG-NEXT: 
v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; VI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -2326,7 +5375,29 @@ ; VI-GISEL-LABEL: v_div_arcp_sqrt_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7 +; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -2348,7 +5419,29 @@ ; SI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -2368,7 +5461,29 @@ ; SI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: s_mov_b32 s4, 0 +; SI-GISEL-NEXT: s_brev_b32 s5, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], 
v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x260 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -2388,7 +5503,29 @@ ; VI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, 
v[2:3], v7 +; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; VI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -2405,7 +5542,29 @@ ; VI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7 +; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -2427,9 +5586,30 @@ ; SI-SDAG-LABEL: v_div_const_contract_sqrt_f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_brev_b32 s7, 8 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s7, 0x40700000 +; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7] ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 @@ -2449,10 +5629,31 @@ ; SI-GISEL-LABEL: v_div_const_contract_sqrt_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] ; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_brev_b32 s7, 8 +; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; 
SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-GISEL-NEXT: s_mov_b32 s7, 0x40700000 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000 +; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7] ; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], s[6:7], v[0:1], s[6:7] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -2472,9 +5673,30 @@ ; VI-SDAG-LABEL: v_div_const_contract_sqrt_f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; VI-SDAG-NEXT: s_mov_b32 s5, 0x40700000 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], 
v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5] ; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -2491,9 +5713,30 @@ ; VI-GISEL-LABEL: v_div_const_contract_sqrt_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] ; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; VI-GISEL-NEXT: s_mov_b32 s5, 0x40700000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 
v2, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5] ; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -2514,3 +5757,5 @@ attributes #0 = { "unsafe-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} +; GISEL: {{.*}} +; SDAG: {{.*}}