Index: llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1181,6 +1181,13 @@
                                 const SrcOp &Op0, const SrcOp &Op1,
                                 std::optional<unsigned> Flags = std::nullopt);
 
+  /// Build and insert a \p Res = G_IS_FPCLASS \p Src, \p Mask
+  MachineInstrBuilder buildIsFPClass(const DstOp &Res, const SrcOp &Src,
+                                     unsigned Mask) {
+    return buildInstr(TargetOpcode::G_IS_FPCLASS, {Res},
+                      {Src, SrcOp(static_cast<int64_t>(Mask))});
+  }
+
   /// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1
   ///
   /// \pre setBasicBlock or setMI must have been called.
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -146,6 +146,9 @@
   bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B) const;
 
+  bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI,
+                     MachineIRBuilder &B) const;
+
   bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  MachineIRBuilder &B) const;
 
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -907,7 +907,12 @@
       .clampScalar(0, S16, S64);
 
   if (ST.has16BitInsts()) {
-    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
+    getActionDefinitionsBuilder(G_FSQRT)
+      .legalFor({S32, S16})
+      .customFor({S64})
+      .scalarize(0)
+      .clampScalar(0, S16, S64);
+    getActionDefinitionsBuilder(G_FFLOOR)
       .legalFor({S32, S64, S16})
       .scalarize(0)
       .clampScalar(0, S16, S64);
@@ -920,7 +925,8 @@
       .lower();
   } else {
     getActionDefinitionsBuilder(G_FSQRT)
-      .legalFor({S32, S64})
+      .legalFor({S32})
+      .customFor({S64})
       .scalarize(0)
       .clampScalar(0, S32, S64);
 
@@ -1976,6 +1982,8 @@
     return legalizeFMad(MI, MRI, B);
   case TargetOpcode::G_FDIV:
     return legalizeFDIV(MI, MRI, B);
+  case TargetOpcode::G_FSQRT:
+    return legalizeFSQRT(MI, MRI, B);
   case TargetOpcode::G_UDIV:
   case TargetOpcode::G_UREM:
   case TargetOpcode::G_UDIVREM:
@@ -4334,6 +4342,90 @@
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
+                                        MachineRegisterInfo &MRI,
+                                        MachineIRBuilder &B) const {
+  // For double type, the SQRT and RSQ instructions don't have the required
+  // precision, so we apply Goldschmidt's algorithm to improve the result:
+  //
+  //   y0 = rsq(x)
+  //   g0 = x * y0
+  //   h0 = 0.5 * y0
+  //
+  //   r0 = 0.5 - h0 * g0
+  //   g1 = g0 * r0 + g0
+  //   h1 = h0 * r0 + h0
+  //
+  //   r1 = 0.5 - h1 * g1  =>  d0 = x - g1 * g1
+  //   g2 = g1 * r1 + g1       g2 = d0 * h1 + g1
+  //   h2 = h1 * r1 + h1
+  //
+  //   r2 = 0.5 - h2 * g2  =>  d1 = x - g2 * g2
+  //   g3 = g2 * r2 + g2       g3 = d1 * h1 + g2
+  //
+  //   sqrt(x) = g3
+
+  const LLT S1 = LLT::scalar(1);
+  const LLT S32 = LLT::scalar(32);
+  const LLT F64 = LLT::scalar(64);
+
+  Register Dst = MI.getOperand(0).getReg();
+  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
+
+  Register X = MI.getOperand(1).getReg();
+  unsigned Flags = MI.getFlags();
+
+  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
+
+  auto ZeroInt = B.buildConstant(S32, 0);
+  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
+
+  // Scale up input if it is too small.
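+  // Note on the scaling constants used below: inputs smaller than
+  // 0x1.0p-767 are multiplied by 2**256 (ldexp by +256) before the
+  // expansion, and the result is multiplied by 2**-128 (ldexp by -128)
+  // afterwards. This is exact, since
+  //   sqrt(x * 2**256) * 2**-128 = sqrt(x) * 2**128 * 2**-128 = sqrt(x),
+  // and it guarantees the value fed to the rsq approximation and to the
+  // FMA refinement steps is never a denormal (the smallest denormal,
+  // 2**-1074, becomes 2**-818 after scaling).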
+ auto ScaleUpFactor = B.buildConstant(S32, 256); + auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); + auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); + + auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false) + .addReg(SqrtX.getReg(0)); + + auto Half = B.buildFConstant(F64, 0.5); + auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); + auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); + + auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); + auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); + + auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); + auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); + + auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); + auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); + + auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); + + auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); + auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); + + auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); + + // Scale down the result. + auto ScaleDownFactor = B.buildConstant(S32, -128); + auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); + SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); + + // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check + // with finite only or nsz because rsq(+/-0) = +/-inf + + // TODO: Check for DAZ and expand to subnormals + auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); + + // If x is +INF, +0, or -0, use its original value + B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); + + MI.eraseFromParent(); + return true; +} + // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. // FIXME: Why do we handle this one but not other removed instructions? // Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -108,6 +108,7 @@ SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -219,6 +219,8 @@ setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); + setOperationAction(ISD::FSQRT, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); @@ -4820,7 +4822,10 @@ "Load should return a value and a chain"); return Result; } - + case ISD::FSQRT: + if (Op.getValueType() == MVT::f64) + return lowerFSQRTF64(Op, DAG); + return SDValue(); case ISD::FSIN: case ISD::FCOS: return LowerTrig(Op, DAG); @@ -9609,6 +9614,87 @@ return SDValue(); } +SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { + // For double type, the SQRT and RSQ instructions don't have required + // precision, we apply Goldschmidt's algorithm to improve the result: + // + // y0 = rsq(x) + // g0 = x * y0 + // h0 = 0.5 * y0 + // + // r0 = 0.5 - h0 * g0 + // g1 = g0 * r0 + g0 + // h1 = h0 * r0 + h0 + // + // r1 = 
0.5 - h1 * g1 => d0 = x - g1 * g1 + // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 + // h2 = h1 * r1 + h1 + // + // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 + // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 + // + // sqrt(x) = g3 + + SDNodeFlags Flags = Op->getFlags(); + + SDLoc DL(Op); + + SDValue X = Op.getOperand(0); + SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64); + + SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT); + + SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32); + + // Scale up input if it is too small. + SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32); + SDValue ScaleUp = + DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt); + SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags); + + SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX); + + SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY); + + SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64); + SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half); + + SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0); + SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half); + + SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0); + + SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0); + + SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1); + SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX); + + SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1); + + SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2); + SDValue SqrtD1 = + DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX); + + SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); + + SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32); + SDValue ScaleDown = + DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt); + SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); + + // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check + // with finite only or nsz because rsq(+/-0) = +/-inf + + // TODO: Check for DAZ and expand to subnormals + SDValue IsZeroOrInf = + DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, + DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); + + // If x is +INF, +0, or -0, use its original value + return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet, + Flags); +} + SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -332,7 +332,7 @@ let TRANS = 1, SchedRW = [WriteTrans64] in { defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>; defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>; -defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>; +defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; } // End TRANS = 1, SchedRW = [WriteTrans64] let TRANS = 1, SchedRW = [WriteTrans32] in { Index: llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll +++ llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll @@ -52,10 +52,10 @@ ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; ALL-SIZE-LABEL: 'fsqrt' @@ -63,10 +63,10 @@ ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) -; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %F32 = call float @llvm.sqrt.f32(float undef) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir @@ -1,9 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=SI,GCN %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefixes=VI,GCN %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN %s --- name: test_fsqrt_s32 @@ -11,24 +11,12 @@ bb.0: liveins: $vgpr0 - ; SI-LABEL: name: test_fsqrt_s32 - ; SI: liveins: $vgpr0 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]] - ; SI-NEXT: $vgpr0 = COPY [[FSQRT]](s32) - ; VI-LABEL: name: test_fsqrt_s32 - ; VI: liveins: $vgpr0 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]] - ; VI-NEXT: $vgpr0 = COPY [[FSQRT]](s32) - ; GFX9-LABEL: name: test_fsqrt_s32 - ; GFX9: liveins: $vgpr0 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]] - ; GFX9-NEXT: $vgpr0 = COPY [[FSQRT]](s32) + ; GCN-LABEL: name: test_fsqrt_s32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = 
G_FSQRT [[COPY]] + ; GCN-NEXT: $vgpr0 = COPY [[FSQRT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_FSQRT %0 $vgpr0 = COPY %1 @@ -40,28 +28,82 @@ bb.0: liveins: $vgpr0 - ; SI-LABEL: name: test_fsqrt_s64 - ; SI: liveins: $vgpr0 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]] - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64) - ; VI-LABEL: name: test_fsqrt_s64 - ; VI: liveins: $vgpr0 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]] - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64) - ; GFX9-LABEL: name: test_fsqrt_s64 - ; GFX9: liveins: $vgpr0 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]] - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64) + ; GCN-LABEL: name: test_fsqrt_s64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000 + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]] + ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]] + ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[COPY]], [[SELECT]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64) + ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]] + ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]] + ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]] + ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]] + ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]] + ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]] + ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]] + ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]] + ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]] + ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]] + ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]] + ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]] + ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 + ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]] + ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32) + ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608 + ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]] + ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_FSQRT %0 $vgpr0_vgpr1 = COPY %1 +... 
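The long FMA chain in the s64 check lines above is easier to read against a scalar model of the same expansion. The sketch below is an editorial illustration, not compiler or test code: the helper name model_sqrt_f64 is hypothetical, plain host doubles stand in for the f64 VALU ops, std::fma for G_FMA / V_FMA_F64, and an exact 1.0 / std::sqrt stands in for the much less accurate V_RSQ_F64 approximation that the refinement steps exist to correct. The same sequence is produced by both the GlobalISel legalizer and the SelectionDAG lowering.

#include <cmath>

// Scalar model of the f64 sqrt expansion (illustration only). Invariants
// across the steps: g ~= sqrt(sx) and h ~= 0.5 / sqrt(sx), so each
// correction is effectively a Newton step g += (sx - g*g) * h.
static double model_sqrt_f64(double x) {
  bool NeedScale = x < 0x1.0p-767;
  double sx = std::ldexp(x, NeedScale ? 256 : 0); // scale tiny inputs up

  double y0 = 1.0 / std::sqrt(sx);                // stands in for rsq(sx)
  double h0 = 0.5 * y0;
  double g0 = sx * y0;

  double r0 = std::fma(-h0, g0, 0.5);             // r0 = 0.5 - h0 * g0
  double g1 = std::fma(g0, r0, g0);
  double h1 = std::fma(h0, r0, h0);

  double d0 = std::fma(-g1, g1, sx);              // d0 = x - g1 * g1
  double g2 = std::fma(d0, h1, g1);

  double d1 = std::fma(-g2, g2, sx);              // d1 = x - g2 * g2
  double g3 = std::fma(d1, h1, g2);

  double Res = std::ldexp(g3, NeedScale ? -128 : 0); // undo the scaling
  // The final is.fpclass/select keeps +/-0 and +inf unchanged.
  return (sx == 0.0 || sx == INFINITY) ? sx : Res;
}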
+ +--- +name: test_fsqrt_s64_ninf +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: test_fsqrt_s64_ninf + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000 + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]] + ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]] + ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[COPY]], [[SELECT]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64) + ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]] + ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]] + ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]] + ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]] + ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]] + ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]] + ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]] + ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]] + ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]] + ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]] + ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]] + ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]] + ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 + ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]] + ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[FMA6]], [[SELECT1]](s32) + ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608 + ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = ninf G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]] + ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = ninf G_FSQRT %0 + $vgpr0_vgpr1 = COPY %1 + ... 
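A note on the G_IS_FPCLASS immediate in the checks above: 608 is the fcZero | fcPosInf mask written in the C++ lowering, spelled out with LLVM's FPClassTest bit values (bit 5 = -0, bit 6 = +0, bit 9 = +inf):

  fcNegZero (0x020) + fcPosZero (0x040) + fcPosInf (0x200) = 0x260 = 608

The same constant appears as the 0x260 literal fed to v_cmp_class_f64 in the assembly tests below, so for +0, -0 and +inf the final select returns the (possibly scaled) input itself rather than the Goldschmidt result.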
--- name: test_fsqrt_s16 @@ -108,33 +150,15 @@ bb.0: liveins: $vgpr0_vgpr1 - ; SI-LABEL: name: test_fsqrt_v2s32 - ; SI: liveins: $vgpr0_vgpr1 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; VI-LABEL: name: test_fsqrt_v2s32 - ; VI: liveins: $vgpr0_vgpr1 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX9-LABEL: name: test_fsqrt_v2s32 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GCN-LABEL: name: test_fsqrt_v2s32 + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] + ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32) + ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_FSQRT %0 $vgpr0_vgpr1 = COPY %1 @@ -146,36 +170,16 @@ bb.0: liveins: $vgpr0_vgpr1_vgpr2 - ; SI-LABEL: name: test_fsqrt_v3s32 - ; SI: liveins: $vgpr0_vgpr1_vgpr2 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; SI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]] - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32) - ; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; VI-LABEL: name: test_fsqrt_v3s32 - ; VI: liveins: $vgpr0_vgpr1_vgpr2 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; VI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32) - ; VI-NEXT: 
$vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-LABEL: name: test_fsqrt_v3s32 - ; GFX9: liveins: $vgpr0_vgpr1_vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; GFX9-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; GCN-LABEL: name: test_fsqrt_v3s32 + ; GCN: liveins: $vgpr0_vgpr1_vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] + ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] + ; GCN-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32) + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = G_FSQRT %0 $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -187,33 +191,58 @@ bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; SI-LABEL: name: test_fsqrt_v2s64 - ; SI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; SI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]] - ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]] - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64) - ; SI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; VI-LABEL: name: test_fsqrt_v2s64 - ; VI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; VI-NEXT: {{ $}} - ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; VI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]] - ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64) - ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; GFX9-LABEL: name: test_fsqrt_v2s64 - ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]] - ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GCN-LABEL: name: test_fsqrt_v2s64 + ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 
0x1000000000000000 + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s64), [[C]] + ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]] + ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[UV]], [[SELECT]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64) + ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]] + ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]] + ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]] + ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]] + ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]] + ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]] + ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]] + ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]] + ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]] + ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]] + ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]] + ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]] + ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 + ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]] + ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32) + ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608 + ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]] + ; GCN-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s64), [[C]] + ; GCN-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C1]] + ; GCN-NEXT: [[FLDEXP2:%[0-9]+]]:_(s64) = G_FLDEXP [[UV1]], [[SELECT3]](s32) + ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP2]](s64) + ; GCN-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[INT1]], [[C3]] + ; GCN-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP2]], [[INT1]] + ; GCN-NEXT: [[FNEG3:%[0-9]+]]:_(s64) = G_FNEG [[FMUL2]] + ; GCN-NEXT: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FNEG3]], [[FMUL3]], [[C3]] + ; GCN-NEXT: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FMUL3]], [[FMA7]], [[FMUL3]] + ; GCN-NEXT: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMUL2]], [[FMA7]], [[FMUL2]] + ; GCN-NEXT: [[FNEG4:%[0-9]+]]:_(s64) = G_FNEG [[FMA8]] + ; GCN-NEXT: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG4]], [[FMA8]], [[FLDEXP2]] + ; GCN-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMA8]] + ; GCN-NEXT: [[FNEG5:%[0-9]+]]:_(s64) = G_FNEG [[FMA11]] + ; GCN-NEXT: [[FMA12:%[0-9]+]]:_(s64) = G_FMA [[FNEG5]], [[FMA11]], [[FLDEXP2]] + ; GCN-NEXT: [[FMA13:%[0-9]+]]:_(s64) = G_FMA [[FMA12]], [[FMA9]], [[FMA11]] + ; GCN-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C1]] + ; GCN-NEXT: [[FLDEXP3:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA13]], [[SELECT4]](s32) + ; GCN-NEXT: [[IS_FPCLASS1:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP2]](s64), 608 + ; GCN-NEXT: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS1]](s1), [[FLDEXP2]], [[FLDEXP3]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT2]](s64), [[SELECT5]](s64) + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<2 x s64>) = G_FSQRT %0 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 Index: llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -1,48 +1,248 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s define double @v_sqrt_f64(double %x) { -; GCN-LABEL: v_sqrt_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; 
GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_fneg(double %x) { -; GCN-LABEL: v_sqrt_f64_fneg: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e64 v[0:1], -v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_fneg: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 9 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_fneg: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %x.neg = fneg double %x %result = call double @llvm.sqrt.f64(double %x.neg) ret double %result } define double @v_sqrt_f64_fabs(double %x) { -; GCN-LABEL: v_sqrt_f64_fabs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_fabs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_fabs: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %x.fabs = call double @llvm.fabs.f64(double %x) %result = call double @llvm.sqrt.f64(double %x.fabs) ret double %result } define double @v_sqrt_f64_fneg_fabs(double %x) { -; GCN-LABEL: v_sqrt_f64_fneg_fabs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_fneg_fabs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 9 +; SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 
0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_fneg_fabs: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %x.fabs = call double @llvm.fabs.f64(double %x) %x.fabs.neg = fneg double %x.fabs %result = call double @llvm.sqrt.f64(double %x.fabs.neg) @@ -50,42 +250,245 @@ } define double @v_sqrt_f64_ninf(double %x) { -; GCN-LABEL: v_sqrt_f64_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; 
GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call ninf double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" { -; GCN-LABEL: v_sqrt_f64_no_infs_attribute: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_no_infs_attribute: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_no_infs_attribute: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, 
vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call ninf double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_nnan(double %x) { -; GCN-LABEL: v_sqrt_f64_nnan: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_nnan: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_nnan: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nnan double @llvm.sqrt.f64(double %x) ret double %result } define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) { -; GCN-LABEL: s_sqrt_f64: -; GCN: ; %bb.0: -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: ; return to shader part epilog +; SDAG-LABEL: s_sqrt_f64: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SDAG-NEXT: 
s_cselect_b32 s0, 0xffffff80, 0 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_sqrt_f64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, 0 +; GISEL-NEXT: s_brev_b32 s3, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-NEXT: ; return to shader part epilog %result = call double @llvm.sqrt.f64(double %x) %cast = bitcast double %result to <2 x i32> %cast.0 = extractelement <2 x i32> %cast, i32 0 @@ -98,12 +501,65 @@ } define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) { -; GCN-LABEL: s_sqrt_f64_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: ; return to shader part epilog +; SDAG-LABEL: s_sqrt_f64_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], 
v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_sqrt_f64_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, 0 +; GISEL-NEXT: s_brev_b32 s3, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-NEXT: ; return to shader part epilog %result = call ninf double @llvm.sqrt.f64(double %x) %cast = bitcast double %result to <2 x i32> %cast.0 = extractelement <2 x i32> %cast, i32 0 @@ -116,12 +572,65 @@ } define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) { -; GCN-LABEL: s_sqrt_f64_afn: -; GCN: ; %bb.0: -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: ; return to shader part epilog +; SDAG-LABEL: s_sqrt_f64_afn: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_sqrt_f64_afn: +; GISEL: ; %bb.0: +; 
GISEL-NEXT: s_mov_b32 s2, 0 +; GISEL-NEXT: s_brev_b32 s3, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-NEXT: ; return to shader part epilog %result = call afn double @llvm.sqrt.f64(double %x) %cast = bitcast double %result to <2 x i32> %cast.0 = extractelement <2 x i32> %cast, i32 0 @@ -134,12 +643,65 @@ } define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) { -; GCN-LABEL: s_sqrt_f64_afn_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: ; return to shader part epilog +; SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 +; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, 0 +; GISEL-NEXT: s_brev_b32 s3, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], 
v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-NEXT: ; return to shader part epilog %result = call afn nnan ninf double @llvm.sqrt.f64(double %x) %cast = bitcast double %result to <2 x i32> %cast.0 = extractelement <2 x i32> %cast, i32 0 @@ -152,167 +714,1147 @@ } define double @v_sqrt_f64_nsz(double %x) { -; GCN-LABEL: v_sqrt_f64_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_nsz: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_nsz: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_nnan_ninf(double %x) { -; GCN-LABEL: v_sqrt_f64_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nnan ninf double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_nnan_ninf_nsz(double %x) { -; GCN-LABEL: v_sqrt_f64_nnan_ninf_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, 
vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nnan ninf nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_afn(double %x) { -; GCN-LABEL: v_sqrt_f64_afn: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 
vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_afn_nsz(double %x) { -; GCN-LABEL: v_sqrt_f64_afn_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn_nsz: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn_nsz: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], 
v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nsz double @llvm.sqrt.f64(double %x) ret double %result } define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { -; GCN-LABEL: v_sqrt_v2f64_afn: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_v2f64_afn: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5] +; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7] +; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_v2f64_afn: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; GISEL-NEXT: 
v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) ret <2 x double> %result } define double @v_sqrt_f64_afn_nnan(double %x) { -; GCN-LABEL: v_sqrt_f64_afn_nnan: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn_nnan: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn_nnan: +; GISEL: ; %bb.0: +; 
GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nnan double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64_fabs_afn_ninf(double %x) { -; GCN-LABEL: v_sqrt_f64_fabs_afn_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: 
v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %fabs = call double @llvm.fabs.f64(double %x) %result = call afn ninf double @llvm.sqrt.f64(double %fabs) ret double %result } define double @v_sqrt_f64_afn_nnan_ninf(double %x) { -; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nnan ninf double @llvm.sqrt.f64(double %x) ret double %result } define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { -; GCN-LABEL: v_sqrt_v2f64_afn_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_v2f64_afn_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5] +; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7] +; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; 
GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) ret <2 x double> %result } define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) { -; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], 
v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call afn nnan ninf nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 { -; GCN-LABEL: v_sqrt_f64__approx_func_fp_math: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 { -; GCN-LABEL: v_sqrt_f64__enough_unsafe_attrs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 
s[30:31] +; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nsz double @llvm.sqrt.f64(double %x) ret double %result } define double @v_sqrt_f64__unsafe_attr(double %x) #4 { -; GCN-LABEL: v_sqrt_f64__unsafe_attr: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_f64__unsafe_attr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] +; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: 
v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_f64__unsafe_attr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 +; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call nsz double @llvm.sqrt.f64(double %x) ret double %result } define <2 x double> @v_sqrt_v2f64(<2 x double> %x) { -; GCN-LABEL: v_sqrt_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_v2f64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s4, 0 +; SDAG-NEXT: s_brev_b32 s5, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5] +; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7] +; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] 
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_v2f64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5 +; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5 +; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1] +; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] +; GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 +; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) ret <2 x double> %result } define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { -; GCN-LABEL: v_sqrt_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] -; GCN-NEXT: v_sqrt_f64_e32 v[4:5], v[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: v_sqrt_v3f64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s6, 0 +; 
SDAG-NEXT: s_brev_b32 s7, 8 +; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3] +; SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] +; SDAG-NEXT: v_rsq_f64_e32 v[10:11], v[4:5] +; SDAG-NEXT: v_mul_f64 v[12:13], v[0:1], v[6:7] +; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9] +; SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 +; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[10:11] +; SDAG-NEXT: v_mul_f64 v[10:11], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[12:13], 0.5 +; SDAG-NEXT: v_fma_f64 v[20:21], -v[8:9], v[14:15], 0.5 +; SDAG-NEXT: v_fma_f64 v[22:23], -v[10:11], v[16:17], 0.5 +; SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15] +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9] +; SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1] +; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3] +; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5] +; SDAG-NEXT: v_fma_f64 v[12:13], v[18:19], v[6:7], v[12:13] +; SDAG-NEXT: v_fma_f64 v[14:15], v[20:21], v[8:9], v[14:15] +; SDAG-NEXT: v_fma_f64 v[16:17], v[22:23], v[10:11], v[16:17] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1] +; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3] +; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[12:13] +; SDAG-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v13, 0x260 +; SDAG-NEXT: v_fma_f64 v[8:9], v[20:21], v[8:9], v[14:15] +; SDAG-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5] +; SDAG-NEXT: v_fma_f64 v[10:11], v[22:23], v[10:11], v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7] +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13 +; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15 +; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13 +; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sqrt_v3f64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s6, 0 +; GISEL-NEXT: s_brev_b32 s7, 8 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3] +; GISEL-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; GISEL-NEXT: 
v_cndmask_b32_e32 v7, 0, v6, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[6:7] +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v7 +; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] +; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[4:5] +; GISEL-NEXT: v_mul_f64 v[12:13], v[6:7], 0.5 +; GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] +; GISEL-NEXT: v_mul_f64 v[14:15], v[8:9], 0.5 +; GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[8:9] +; GISEL-NEXT: v_mul_f64 v[16:17], v[10:11], 0.5 +; GISEL-NEXT: v_mul_f64 v[10:11], v[4:5], v[10:11] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5 +; GISEL-NEXT: v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5 +; GISEL-NEXT: v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5 +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9] +; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11] +; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1] +; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3] +; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7] +; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9] +; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1] +; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3] +; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5] +; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7] +; GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc +; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7] +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14 +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13 +; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15 +; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13 +; GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <3 x double> @llvm.sqrt.v3f64(<3 x double> %x) ret <3 x double> %result } @@ -414,5 +1956,4 @@ attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" } attributes #4 = { "unsafe-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line:
-; GISEL: {{.*}}
-; SDAG: {{.*}}
+; GCN: {{.*}}
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -3,6 +3,7 @@
 declare float @llvm.amdgcn.rcp.f32(float) #0
 declare double @llvm.amdgcn.rcp.f64(double) #0
+declare double @llvm.amdgcn.sqrt.f64(double) #0
 declare double @llvm.sqrt.f64(double) #0
 declare float @llvm.sqrt.f32(float) #0

@@ -124,7 +125,15 @@
 ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f64:
 ; SI-NOT: v_rsq_f64_e32
-; SI: v_sqrt_f64
+; SI: v_rsq_f64
+; SI: v_mul_f64
+; SI: v_mul_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
 ; SI: v_rcp_f64
 define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
   %sqrt = call double @llvm.sqrt.f64(double %src)
@@ -133,12 +142,42 @@
   ret void
 }

+; FUNC-LABEL: {{^}}safe_amdgcn_sqrt_rsq_rcp_pat_f64:
+; SI-NOT: v_rsq_f64_e32
+; SI: v_sqrt_f64
+; SI: v_rcp_f64
+define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
+  %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
+  %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+  store double %rcp, ptr addrspace(1) %out, align 8
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f64:
+; SI: v_rsq_f64
+; SI: v_mul_f64
+; SI: v_mul_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_rcp_f64
+; SI: buffer_store_dwordx2
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
+  %sqrt = call double @llvm.sqrt.f64(double %src)
+  %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+  store double %rcp, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_amdgcn_sqrt_rsq_rcp_pat_f64:
 ; SI: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SQRT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
-  %sqrt = call double @llvm.sqrt.f64(double %src)
+define amdgpu_kernel void @unsafe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
+  %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
   %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
   store double %rcp, ptr addrspace(1) %out, align 8
   ret void
Index: llvm/test/CodeGen/AMDGPU/rsq.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/rsq.ll
+++ llvm/test/CodeGen/AMDGPU/rsq.ll
@@ -4,6 +4,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare float @llvm.sqrt.f32(float) nounwind readnone
 declare double @llvm.sqrt.f64(double) nounwind readnone
+declare double @llvm.amdgcn.sqrt.f64(double) nounwind readnone

 ; SI-LABEL: {{^}}rsq_f32:
 ; SI: v_rsq_f32_e32
@@ -18,6 +19,8 @@
 ; SI-LABEL: {{^}}rsq_f64:
 ; SI: v_rsq_f64
+; SI: v_mul_f64
+; SI: v_mul_f64
 ; SI: v_fma_f64
 ; SI: v_fma_f64
 ; SI: v_fma_f64
@@ -34,6 +37,18 @@
   ret void
 }

+; SI-LABEL: {{^}}amdgcn_sqrt_rsq_f64:
+; SI: v_sqrt_f64
+; SI: v_rcp_f64
+; SI: s_endpgm
+define amdgpu_kernel void @amdgcn_sqrt_rsq_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+  %val = load double, ptr addrspace(1) %in, align 4
+  %sqrt = call double @llvm.amdgcn.sqrt.f64(double %val) nounwind readnone
+  %div = fdiv double 1.0, %sqrt
+
store double %div, ptr addrspace(1) %out, align 4 + ret void +} + ; SI-LABEL: {{^}}rsq_f32_sgpr: ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} ; SI: s_endpgm @@ -97,20 +112,51 @@ ret void } +; SI-LABEL: {{^}}amdgcn_sqrt_neg_rsq_f64: +; SI-SAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; SI-SAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; SI-SAFE: v_div_scale_f64 +; SI-SAFE: v_div_scale_f64 + +; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; SI-UNSAFE: v_rcp_f64_e32 [[RCP:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RCP]], 1.0 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: v_fma_f64 +define amdgpu_kernel void @amdgcn_sqrt_neg_rsq_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { + %val = load double, ptr addrspace(1) %in, align 4 + %sqrt = call double @llvm.amdgcn.sqrt.f64(double %val) + %div = fdiv double -1.0, %sqrt + store double %div, ptr addrspace(1) %out, align 4 + ret void +} + ; SI-LABEL: {{^}}neg_rsq_f64: -; SI: v_rsq_f64_e32 -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_div_scale_f64 -; SI: v_div_scale_f64 -; SI: v_rcp_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 +; SI-SAFE: v_rsq_f64_e32 +; SI-SAFE: v_mul_f64 +; SI-SAFE: v_fma_f64 +; SI-SAFE: v_fma_f64 +; SI-SAFE: v_fma_f64 +; SI-SAFE: v_fma_f64 +; SI-SAFE: v_fma_f64 +; SI-SAFE: v_fma_f64 +; SI-SAFE: v_fma_f64 + +; SI-SAFE: v_div_scale_f64 + +; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; SI-UNSAFE: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; SI-UNSAFE: v_rcp_f64_e32 [[RCP:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RCP]], 1.0 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: v_fma_f64 define amdgpu_kernel void @neg_rsq_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { %val = load double, ptr addrspace(1) %in, align 4 %sqrt = call double @llvm.sqrt.f64(double %val) @@ -138,17 +184,24 @@ ; SI-LABEL: {{^}}neg_rsq_neg_f64: ; SI-SAFE: v_rsq_f64_e64 v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} +; SI-SAFE: v_mul_f64 +; SI-SAFE: v_fma_f64 ; SI-SAFE: v_div_scale_f64 -; SI-UNSAFE: v_rsq_f64_e32 +; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; SI-UNSAFE-DAG: v_rsq_f64_e64 [[RSQ:v\[[0-9]+:[0-9]+\]]], -[[VAL]] +; SI-UNSAFE: v_mul_f64 +; SI-UNSAFE: v_fma_f64 ; SI-UNSAFE: v_fma_f64 ; SI-UNSAFE: v_fma_f64 ; SI-UNSAFE: v_fma_f64 ; SI-UNSAFE: v_fma_f64 ; SI-UNSAFE: v_fma_f64 - -; SI-UNSAFE: v_rcp_f64_e32 ; SI-UNSAFE: v_fma_f64 + + +; SI-UNSAFE: v_rcp_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], +; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0 ; SI-UNSAFE: v_fma_f64 ; SI-UNSAFE: v_fma_f64 ; SI-UNSAFE: v_fma_f64 @@ -163,6 +216,28 @@ ret void } +; SI-LABEL: {{^}}amdgcn_sqrt_neg_rsq_neg_f64: +; SI-SAFE: v_sqrt_f64_e64 v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} +; SI-SAFE: v_div_scale_f64 + +; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; SI-UNSAFE-DAG: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -[[VAL]] +; SI-UNSAFE: v_rcp_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], [[SQRT]] +; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: v_fma_f64 +; SI-UNSAFE: 
v_fma_f64
+; SI-UNSAFE: v_fma_f64
+; SI-UNSAFE: v_fma_f64
+define amdgpu_kernel void @amdgcn_sqrt_neg_rsq_neg_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+  %val = load double, ptr addrspace(1) %in, align 4
+  %val.fneg = fsub double -0.0, %val
+  %sqrt = call double @llvm.amdgcn.sqrt.f64(double %val.fneg)
+  %div = fdiv double -1.0, %sqrt
+  store double %div, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 !0 = !{float 2.500000e+00}

 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
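
For reference, the expanded v_sqrt_f64 sequence that the SDAG and GISEL checks above verify (v_rsq_f64 seed, v_mul_f64/v_fma_f64 refinement, v_ldexp_f64 rescaling, and a v_cmp_class_f64-based select) can be modeled in plain C. This is only an illustrative sketch and not part of the patch: the function names are invented, 1.0 / sqrt(x) stands in for the hardware rsq approximation, and is_zero_or_posinf() approximates the class test with mask 0x260, which the selects use to pass +/-0 and +inf through unchanged.

#include <math.h>

/* Models the v_cmp_class_f64 test with mask 0x260: +0, -0, or +infinity. */
static int is_zero_or_posinf(double x) {
  return x == 0.0 || (isinf(x) && x > 0.0);
}

/* Illustrative model of the checked f64 sqrt expansion (compile with -lm). */
double expanded_sqrt_f64(double x) {
  /* Scale inputs below 2^-767 up by 2^256 (the v_ldexp_f64 with 0x100). */
  int scaled = x < 0x1.0p-767;
  double sx = ldexp(x, scaled ? 256 : 0);

  double y  = 1.0 / sqrt(sx);     /* stand-in for the v_rsq_f64 seed */
  double h0 = 0.5 * y;            /* v_mul_f64 */
  double g0 = sx * y;             /* v_mul_f64 */
  double r0 = fma(-h0, g0, 0.5);  /* v_fma_f64 refinement of the seed... */
  double g1 = fma(g0, r0, g0);
  double h1 = fma(h0, r0, h0);
  double d0 = fma(-g1, g1, sx);
  double g2 = fma(d0, h1, g1);
  double d1 = fma(-g2, g2, sx);
  double g3 = fma(d1, h1, g2);    /* ...g3 converges toward sqrt(sx) */

  /* Undo the scaling: sqrt(x * 2^256) == sqrt(x) * 2^128 (ldexp by -128). */
  double r = ldexp(g3, scaled ? -128 : 0);

  /* +/-0 and +inf come back unchanged; the rsq seed would give inf/0 here. */
  return is_zero_or_posinf(sx) ? sx : r;
}

The rsq.ll and llvm.amdgcn.rcp.ll checks then distinguish this expansion, used for llvm.sqrt.f64, from the raw v_sqrt_f64 instruction, which is still emitted for llvm.amdgcn.sqrt.f64.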