Index: llvm/docs/ReleaseNotes.rst =================================================================== --- llvm/docs/ReleaseNotes.rst +++ llvm/docs/ReleaseNotes.rst @@ -71,7 +71,7 @@ Changes to the AMDGPU Backend ----------------------------- -* `llvm.sqrt.f64` is now lowered correctly. Use `llvm.amdgcn.sqrt.f64` +* `llvm.sqrt.f32` is now lowered correctly. Use `llvm.amdgcn.sqrt.f32` for raw instruction access. * Implemented `llvm.stacksave` and `llvm.stackrestore` intrinsics. Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -108,9 +108,31 @@ bool HasUnsafeFPMath = false; bool HasFP32DenormalFlush = false; bool FlowChanged = false; + mutable Function *SqrtF32 = nullptr; + mutable Function *LdexpF32 = nullptr; DenseMap BreakPhiNodesCache; + Function *getSqrtF32() const { + if (SqrtF32) + return SqrtF32; + + LLVMContext &Ctx = Mod->getContext(); + SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt, + {Type::getFloatTy(Ctx)}); + return SqrtF32; + } + + Function *getLdexpF32() const { + if (LdexpF32) + return LdexpF32; + + LLVMContext &Ctx = Mod->getContext(); + LdexpF32 = Intrinsic::getDeclaration( + Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)}); + return LdexpF32; + } + bool canBreakPHINode(const PHINode &I); /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to @@ -276,6 +298,8 @@ bool IsNegative) const; Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS, FastMathFlags FMF) const; + Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, + FastMathFlags FMF) const; public: bool visitFDiv(BinaryOperator &I); @@ -290,6 +314,7 @@ bool visitIntrinsicInst(IntrinsicInst &I); bool visitBitreverseIntrinsicInst(IntrinsicInst &I); bool visitMinNum(IntrinsicInst &I); + bool visitSqrt(IntrinsicInst &I); bool run(Function &F); }; @@ -810,14 +835,10 @@ // range won't underflow to denormal. The hard part is knowing the // result. We need a range check, the result could be denormal for // 0x1p+126 < den <= 0x1p+127. - - Type *Ty = Src->getType(); - auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src); Value *ScaleFactor = Builder.CreateNeg(FrexpExp); Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant); - return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, - {Rcp, ScaleFactor}); + return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor}); } /// Emit a 2ulp expansion for fdiv by using frexp for input scaling. @@ -833,8 +854,6 @@ // We're scaling the LHS to avoid a denormal input, and scale the denominator // to avoid large values underflowing the result. - Type *Ty = LHS->getType(); - auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS); Value *Rcp = @@ -846,8 +865,30 @@ // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the // result. Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS); - return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, - {Mul, ExpDiff}); + return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff}); +} + +/// Emit a sqrt that handles denormals and is accurate to 2ulp. 
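+///
+/// v_sqrt_f32 is 1 ulp accurate but flushes denormal inputs. Rather than
+/// emitting the full correctly rounded expansion, pre-scale inputs below the
+/// smallest normal by 2^32 (ldexp by +32), take the raw sqrt, and rescale the
+/// result by 2^-16, using sqrt(x * 2^32) == sqrt(x) * 2^16. For denormal
+/// inputs both ldexp steps are exact, and for inputs that are already normal
+/// the selected scale factors are 0, leaving the value unchanged.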
+Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder, + Value *Src, + FastMathFlags FMF) const { + Type *Ty = Src->getType(); + APFloat SmallestNormal = + APFloat::getSmallestNormalized(Ty->getFltSemantics()); + Value *NeedScale = + Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal)); + + ConstantInt *Zero = Builder.getInt32(0); + Value *InputScaleFactor = + Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero); + + Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor}); + + Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled); + + Value *OutputScaleFactor = + Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero); + return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor}); } /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals. @@ -2010,6 +2051,8 @@ return visitBitreverseIntrinsicInst(I); case Intrinsic::minnum: return visitMinNum(I); + case Intrinsic::sqrt: + return visitSqrt(I); default: return false; } @@ -2103,9 +2146,76 @@ return true; } +static bool isOneOrNegOne(const Value *Val) { + const APFloat *C; + return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0; +} + +// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. +bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { + Type *Ty = Sqrt.getType()->getScalarType(); + if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts())) + return false; + + const FPMathOperator *FPOp = cast(&Sqrt); + FastMathFlags SqrtFMF = FPOp->getFastMathFlags(); + + // We're trying to handle the fast-but-not-that-fast case only. The lowering + // of fast llvm.sqrt will give the raw instruction anyway. + if (SqrtFMF.approxFunc() || HasUnsafeFPMath) + return false; + + const float ReqdAccuracy = FPOp->getFPAccuracy(); + + // Defer correctly rounded expansion to codegen. + if (ReqdAccuracy < 1.0f) + return false; + + // FIXME: This is an ugly hack for this pass using forward iteration instead + // of reverse. If it worked like a normal combiner, the rsq would form before + // we saw a sqrt call. + auto *FDiv = + dyn_cast_or_null(Sqrt.getUniqueUndroppableUser()); + if (FDiv && FDiv->getOpcode() == Instruction::FDiv && + FDiv->getFPAccuracy() >= 1.0f && + canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) && + // TODO: We should also handle the arcp case for the fdiv with non-1 value + isOneOrNegOne(FDiv->getOperand(0))) + return false; + + IRBuilder<> Builder(&Sqrt); + Value *SrcVal = Sqrt.getOperand(0); + + SmallVector SrcVals; + extractValues(Builder, SrcVals, SrcVal); + + bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt); + + // The raw instruction is 1 ulp, but the correction for denormal handling + // brings it to 2. 
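+  //
+  // For example, a call such as
+  //   %r = call float @llvm.sqrt.f32(float %x), !fpmath !{float 2.0}
+  // can always be expanded here, while !fpmath !{float 1.0} is only expanded
+  // when the input is known not to be a denormal (CanTreatAsDAZ) and is
+  // otherwise left for the correctly rounded expansion in codegen.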
+ if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f) + return false; + + SmallVector ResultVals(SrcVals.size()); + for (int I = 0, E = SrcVals.size(); I != E; ++I) { + if (CanTreatAsDAZ) + ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]); + else + ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF); + } + + Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals); + NewSqrt->takeName(&Sqrt); + Sqrt.replaceAllUsesWith(NewSqrt); + Sqrt.eraseFromParent(); + return true; +} + bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { Impl.Mod = &M; Impl.DL = &Impl.Mod->getDataLayout(); + Impl.SqrtF32 = nullptr; + Impl.LdexpF32 = nullptr; return false; } Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -61,6 +61,9 @@ SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags); + static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, + SDNodeFlags Flags); SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const; SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const; Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2477,15 +2477,17 @@ llvm_unreachable("covered opcode switch"); } -static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags) { +bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, + SDNodeFlags Flags) { if (Flags.hasApproximateFuncs()) return true; auto &Options = DAG.getTarget().Options; return Options.UnsafeFPMath || Options.ApproxFuncFPMath; } -static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, - SDNodeFlags Flags) { +bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, + SDValue Src, + SDNodeFlags Flags) { return !valueIsKnownNeverF32Denorm(Src) && DAG.getMachineFunction() .getDenormalMode(APFloat::IEEEsingle()) Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -157,6 +157,12 @@ bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -914,10 +914,10 @@ if (ST.has16BitInsts()) { getActionDefinitionsBuilder(G_FSQRT) - .legalFor({S32, S16}) - .customFor({S64}) + .legalFor({S16}) + .customFor({S32, S64}) .scalarize(0) - .clampScalar(0, S16, S64); + .unsupported(); getActionDefinitionsBuilder(G_FFLOOR) .legalFor({S32, S64, S16}) .scalarize(0) @@ 
-936,10 +936,10 @@ .lower(); } else { getActionDefinitionsBuilder(G_FSQRT) - .legalFor({S32}) - .customFor({S64}) + .customFor({S32, S64, S16}) .scalarize(0) - .clampScalar(0, S32, S64); + .unsupported(); + if (ST.hasFractBug()) { getActionDefinitionsBuilder(G_FFLOOR) @@ -4883,9 +4883,107 @@ return true; } -bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + // Bypass the correct expansion a standard promotion through G_FSQRT would + // get. The f32 op is accurate enough for the f16 cas. + unsigned Flags = MI.getFlags(); + assert(!ST.has16BitInsts()); + const LLT F32 = LLT::scalar(32); + auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags); + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32}) + .addUse(Ext.getReg(0)) + .setMIFlags(Flags); + B.buildFPTrunc(MI.getOperand(0), Log2, Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); + Register Dst = MI.getOperand(0).getReg(); + Register X = MI.getOperand(1).getReg(); + const unsigned Flags = MI.getFlags(); + const LLT S1 = LLT::scalar(1); + const LLT F32 = LLT::scalar(32); + const LLT I32 = LLT::scalar(32); + + if (allowApproxFunc(MF, Flags)) { + B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef({Dst})) + .addUse(X) + .setMIFlags(Flags); + MI.eraseFromParent(); + return true; + } + + auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f); + auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags); + auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f); + auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags); + auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags); + + Register SqrtS = MRI.createGenericVirtualRegister(F32); + if (needsDenormHandlingF32(MF, X, Flags)) { + B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef({SqrtS})) + .addUse(SqrtX.getReg(0)) + .setMIFlags(Flags); + + auto NegOne = B.buildConstant(I32, -1); + auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne); + + auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags); + auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags); + + auto PosOne = B.buildConstant(I32, 1); + auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne); + + auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags); + auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags); + + auto Zero = B.buildFConstant(F32, 0.0f); + auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags); + + SqrtS = + B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0); + + auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags); + SqrtS = + B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0); + } else { + auto SqrtR = + B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0)); + B.buildFMul(SqrtS, SqrtX, SqrtR, Flags); + + auto Half = B.buildFConstant(F32, 0.5f); + auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags); + auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags); + auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags); + SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags); + SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0); + auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags); + auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, 
SqrtX, Flags); + SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0); + } + + auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f); + + auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags); + + SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0); + + auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); + B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { // For double type, the SQRT and RSQ instructions don't have required // precision, we apply Goldschmidt's algorithm to improve the result: // @@ -4967,6 +5065,19 @@ return true; } +bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + if (Ty == LLT::scalar(32)) + return legalizeFSQRTF32(MI, MRI, B); + if (Ty == LLT::scalar(64)) + return legalizeFSQRTF64(MI, MRI, B); + if (Ty == LLT::scalar(16)) + return legalizeFSQRTF16(MI, MRI, B); + return false; +} + // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. // FIXME: Why do we handle this one but not other removed instructions? // Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -109,6 +109,8 @@ SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -220,7 +220,7 @@ setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - setOperationAction(ISD::FSQRT, MVT::f64, Custom); + setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom); setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); @@ -421,6 +421,8 @@ if (Subtarget->has16BitInsts()) { setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote); setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); + } else { + setOperationAction(ISD::FSQRT, MVT::f16, Custom); } if (Subtarget->hasMadMacF32Insts()) @@ -4947,10 +4949,14 @@ "Load should return a value and a chain"); return Result; } - case ISD::FSQRT: - if (Op.getValueType() == MVT::f64) + case ISD::FSQRT: { + EVT VT = Op.getValueType(); + if (VT == MVT::f32) + return lowerFSQRTF32(Op, DAG); + if (VT == MVT::f64) return lowerFSQRTF64(Op, DAG); return SDValue(); + } case ISD::FSIN: case ISD::FCOS: return LowerTrig(Op, DAG); @@ -5405,6 +5411,12 @@ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); return; } + case ISD::FSQRT: { + if (N->getValueType(0) != MVT::f16) + break; + Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG)); + break; + } default: AMDGPUTargetLowering::ReplaceNodeResults(N, Results, 
DAG); break; @@ -9772,6 +9784,111 @@ return SDValue(); } +// Avoid the full correct expansion for f32 sqrt when promoting from f16. +SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + assert(!Subtarget->has16BitInsts()); + SDNodeFlags Flags = Op->getFlags(); + SDValue Ext = + DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags); + + SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32); + SDValue Sqrt = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags); + + return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt, + DAG.getTargetConstant(0, SL, MVT::i32), Flags); +} + +SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDNodeFlags Flags = Op->getFlags(); + MVT VT = Op.getValueType().getSimpleVT(); + const SDValue X = Op.getOperand(0); + + if (allowApproxFunc(DAG, Flags)) { + // Instruction is 1ulp but ignores denormals. + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags); + } + + SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT); + SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT); + + SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT); + + SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags); + + SDValue SqrtX = + DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags); + + SDValue SqrtS; + if (needsDenormHandlingF32(DAG, X, Flags)) { + SDValue SqrtID = + DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32); + SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags); + + SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS); + SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, + DAG.getConstant(-1, DL, MVT::i32)); + SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt); + + SDValue NegSqrtSNextDown = + DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags); + + SDValue SqrtVP = + DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags); + + SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, + DAG.getConstant(1, DL, MVT::i32)); + SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt); + + SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags); + SDValue SqrtVS = + DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags); + + SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); + SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE); + + SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS, + Flags); + + SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT); + SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS, + Flags); + } else { + SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags); + + SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags); + + SDValue Half = DAG.getConstantFP(0.5f, DL, VT); + SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags); + SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags); + + SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags); + SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags); + SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags); + + SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags); + SDValue SqrtD = + 
DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags); + SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags); + } + + SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT); + + SDValue ScaledDown = + DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags); + + SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags); + SDValue IsZeroOrInf = + DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, + DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); + + return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags); +} + SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { // For double type, the SQRT and RSQ instructions don't have required // precision, we apply Goldschmidt's algorithm to improve the result: Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -326,7 +326,7 @@ defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>; defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>; defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; -defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>; +defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, int_amdgcn_sqrt>; } // End TRANS = 1, SchedRW = [WriteTrans32] let TRANS = 1, SchedRW = [WriteTrans64] in { Index: llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll +++ llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll @@ -48,10 +48,10 @@ define i32 @fsqrt(i32 %arg) { ; ALL-LABEL: 'fsqrt' -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.sqrt.f32(float undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.sqrt.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) @@ -59,10 +59,10 @@ ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; ALL-SIZE-LABEL: 'fsqrt' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.sqrt.f32(float undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.sqrt.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll @@ -4,7 +4,22 @@ define amdgpu_cs float @div_sqrt(float inreg %arg1) { ; GCN-LABEL: div_sqrt: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_sqrt_f32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0xf800000, s0 +; GCN-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo +; GCN-NEXT: v_sqrt_f32_e32 v1, v0 +; GCN-NEXT: v_add_nc_u32_e32 v2, -1, v1 +; GCN-NEXT: v_add_nc_u32_e32 v3, 1, v1 +; GCN-NEXT: v_fma_f32 v4, -v2, v1, v0 +; GCN-NEXT: v_fma_f32 v5, -v3, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s0, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 +; GCN-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 +; GCN-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GCN-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: ; return to shader part epilog .entry: @@ -17,7 +32,22 @@ ; GCN-LABEL: sqrt_div: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_rcp_f32_e32 v0, s0 -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GCN-NEXT: v_sqrt_f32_e32 v1, v0 +; GCN-NEXT: v_add_nc_u32_e32 v2, -1, v1 +; GCN-NEXT: v_add_nc_u32_e32 v3, 1, v1 +; GCN-NEXT: v_fma_f32 v4, -v2, v1, v0 +; GCN-NEXT: v_fma_f32 v5, -v3, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s0, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 +; GCN-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 +; GCN-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GCN-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GCN-NEXT: ; return to shader part epilog .entry: %a = fdiv afn float 1.000000e+00, %arg1 @@ -28,7 +58,22 @@ define amdgpu_cs float @rcp_sqrt(float inreg %arg1) { ; GCN-LABEL: rcp_sqrt: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: 
v_sqrt_f32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0xf800000, s0 +; GCN-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo +; GCN-NEXT: v_sqrt_f32_e32 v1, v0 +; GCN-NEXT: v_add_nc_u32_e32 v2, -1, v1 +; GCN-NEXT: v_add_nc_u32_e32 v3, 1, v1 +; GCN-NEXT: v_fma_f32 v4, -v2, v1, v0 +; GCN-NEXT: v_fma_f32 v5, -v3, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s0, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 +; GCN-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 +; GCN-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GCN-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: ; return to shader part epilog .entry: @@ -41,7 +86,22 @@ ; GCN-LABEL: sqrt_rcp: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_rcp_f32_e32 v0, s0 -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GCN-NEXT: v_sqrt_f32_e32 v1, v0 +; GCN-NEXT: v_add_nc_u32_e32 v2, -1, v1 +; GCN-NEXT: v_add_nc_u32_e32 v3, 1, v1 +; GCN-NEXT: v_fma_f32 v4, -v2, v1, v0 +; GCN-NEXT: v_fma_f32 v5, -v3, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s0, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 +; GCN-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 +; GCN-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GCN-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GCN-NEXT: ; return to shader part epilog .entry: %a = call float @llvm.amdgcn.rcp.f32(float %arg1) @@ -52,7 +112,23 @@ define amdgpu_cs float @div_sqrt_contract(float inreg %arg1) { ; GCN-LABEL: div_sqrt_contract: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_rsq_f32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0xf800000, s0 +; GCN-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo +; GCN-NEXT: v_sqrt_f32_e32 v1, v0 +; GCN-NEXT: v_add_nc_u32_e32 v2, -1, v1 +; GCN-NEXT: v_add_nc_u32_e32 v3, 1, v1 +; GCN-NEXT: v_fma_f32 v4, -v2, v1, v0 +; GCN-NEXT: v_fma_f32 v5, -v3, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s0, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 +; GCN-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 +; GCN-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GCN-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: ; return to shader part epilog .entry: %a = call contract float @llvm.sqrt.f32(float %arg1) @@ -63,7 +139,23 @@ define amdgpu_cs float @sqrt_div_contract(float inreg %arg1) { ; GCN-LABEL: sqrt_div_contract: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_rsq_f32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GCN-NEXT: v_sqrt_f32_e32 v1, v0 +; GCN-NEXT: v_add_nc_u32_e32 v2, -1, v1 +; GCN-NEXT: v_add_nc_u32_e32 v3, 1, v1 +; GCN-NEXT: v_fma_f32 v4, -v2, v1, v0 +; GCN-NEXT: v_fma_f32 v5, -v3, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s0, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 +; GCN-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 +; GCN-NEXT: v_mul_f32_e32 v2, 0x37800000, 
v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GCN-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GCN-NEXT: ; return to shader part epilog .entry: %a = fdiv afn contract float 1.000000e+00, %arg1 @@ -74,7 +166,23 @@ define amdgpu_cs float @rcp_sqrt_contract(float inreg %arg1) { ; GCN-LABEL: rcp_sqrt_contract: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_rsq_f32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0xf800000, s0 +; GCN-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo +; GCN-NEXT: v_sqrt_f32_e32 v1, v0 +; GCN-NEXT: v_add_nc_u32_e32 v2, -1, v1 +; GCN-NEXT: v_add_nc_u32_e32 v3, 1, v1 +; GCN-NEXT: v_fma_f32 v4, -v2, v1, v0 +; GCN-NEXT: v_fma_f32 v5, -v3, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s0, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 +; GCN-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 +; GCN-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GCN-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: ; return to shader part epilog .entry: %a = call contract float @llvm.sqrt.f32(float %arg1) @@ -85,7 +193,23 @@ define amdgpu_cs float @sqrt_rcp_contract(float inreg %arg1) { ; GCN-LABEL: sqrt_rcp_contract: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_rsq_f32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GCN-NEXT: v_sqrt_f32_e32 v1, v0 +; GCN-NEXT: v_add_nc_u32_e32 v2, -1, v1 +; GCN-NEXT: v_add_nc_u32_e32 v3, 1, v1 +; GCN-NEXT: v_fma_f32 v4, -v2, v1, v0 +; GCN-NEXT: v_fma_f32 v5, -v3, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s0, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 +; GCN-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 +; GCN-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GCN-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GCN-NEXT: ; return to shader part epilog .entry: %a = call contract float @llvm.amdgcn.rcp.f32(float %arg1) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir @@ -15,8 +15,31 @@ ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]] - ; GCN-NEXT: $vgpr0 = COPY [[FSQRT]](s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x39F0000000000000 + ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[C]](s32), [[COPY]] + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[C1]] + ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMUL]], [[COPY]] + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[SELECT]](s32) + ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GCN-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[INT]], [[C2]] + ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[ADD]] + ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[INT]], [[SELECT]] + ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; 
GCN-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[INT]], [[C3]] + ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[ADD1]] + ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[INT]], [[SELECT]] + ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GCN-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ole), [[FMA]](s32), [[C4]] + ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[ADD]], [[INT]] + ; GCN-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[FMA1]](s32), [[C4]] + ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[ADD1]], [[SELECT1]] + ; GCN-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3EF0000000000000 + ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[SELECT2]], [[C5]] + ; GCN-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMUL1]], [[SELECT2]] + ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[SELECT]](s32), 608 + ; GCN-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[IS_FPCLASS]](s1), [[SELECT]], [[SELECT3]] + ; GCN-NEXT: $vgpr0 = COPY [[SELECT4]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_FSQRT %0 $vgpr0 = COPY %1 @@ -117,10 +140,11 @@ ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT]] - ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT]](s32) + ; SI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT]](s32) + ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; VI-LABEL: name: test_fsqrt_s16 ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} @@ -129,6 +153,7 @@ ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s16) = G_FSQRT [[TRUNC]] ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT]](s16) ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX9-LABEL: name: test_fsqrt_s16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -155,9 +180,49 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x39F0000000000000 + ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[C]](s32), [[UV]] + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[C1]] + ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMUL]], [[UV]] + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[SELECT]](s32) + ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GCN-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[INT]], [[C2]] + ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[ADD]] + ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[INT]], [[SELECT]] + ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GCN-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[INT]], [[C3]] + ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[ADD1]] + ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[INT]], [[SELECT]] + ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GCN-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP 
floatpred(ole), [[FMA]](s32), [[C4]] + ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[ADD]], [[INT]] + ; GCN-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[FMA1]](s32), [[C4]] + ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[ADD1]], [[SELECT1]] + ; GCN-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3EF0000000000000 + ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[SELECT2]], [[C5]] + ; GCN-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMUL1]], [[SELECT2]] + ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[SELECT]](s32), 608 + ; GCN-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[IS_FPCLASS]](s1), [[SELECT]], [[SELECT3]] + ; GCN-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[C]](s32), [[UV1]] + ; GCN-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C1]] + ; GCN-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[FMUL2]], [[UV1]] + ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[SELECT5]](s32) + ; GCN-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[INT1]], [[C2]] + ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[ADD2]] + ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[INT1]], [[SELECT5]] + ; GCN-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[INT1]], [[C3]] + ; GCN-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[ADD3]] + ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[INT1]], [[SELECT5]] + ; GCN-NEXT: [[FCMP4:%[0-9]+]]:_(s1) = G_FCMP floatpred(ole), [[FMA2]](s32), [[C4]] + ; GCN-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[FCMP4]](s1), [[ADD2]], [[INT1]] + ; GCN-NEXT: [[FCMP5:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[FMA3]](s32), [[C4]] + ; GCN-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[FCMP5]](s1), [[ADD3]], [[SELECT6]] + ; GCN-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[SELECT7]], [[C5]] + ; GCN-NEXT: [[SELECT8:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[FMUL3]], [[SELECT7]] + ; GCN-NEXT: [[IS_FPCLASS1:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[SELECT5]](s32), 608 + ; GCN-NEXT: [[SELECT9:%[0-9]+]]:_(s32) = G_SELECT [[IS_FPCLASS1]](s1), [[SELECT5]], [[SELECT8]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT4]](s32), [[SELECT9]](s32) ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_FSQRT %0 @@ -175,10 +240,67 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]] - ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]] - ; GCN-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]] - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x39F0000000000000 + ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[C]](s32), [[UV]] + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[C1]] + ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMUL]], [[UV]] + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[SELECT]](s32) + ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GCN-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[INT]], [[C2]] + ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[ADD]] + ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA 
[[FNEG]], [[INT]], [[SELECT]] + ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GCN-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[INT]], [[C3]] + ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[ADD1]] + ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[INT]], [[SELECT]] + ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GCN-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ole), [[FMA]](s32), [[C4]] + ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[ADD]], [[INT]] + ; GCN-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[FMA1]](s32), [[C4]] + ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[ADD1]], [[SELECT1]] + ; GCN-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3EF0000000000000 + ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[SELECT2]], [[C5]] + ; GCN-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMUL1]], [[SELECT2]] + ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[SELECT]](s32), 608 + ; GCN-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[IS_FPCLASS]](s1), [[SELECT]], [[SELECT3]] + ; GCN-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[C]](s32), [[UV1]] + ; GCN-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C1]] + ; GCN-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[FMUL2]], [[UV1]] + ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[SELECT5]](s32) + ; GCN-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[INT1]], [[C2]] + ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[ADD2]] + ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[INT1]], [[SELECT5]] + ; GCN-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[INT1]], [[C3]] + ; GCN-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[ADD3]] + ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[INT1]], [[SELECT5]] + ; GCN-NEXT: [[FCMP4:%[0-9]+]]:_(s1) = G_FCMP floatpred(ole), [[FMA2]](s32), [[C4]] + ; GCN-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[FCMP4]](s1), [[ADD2]], [[INT1]] + ; GCN-NEXT: [[FCMP5:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[FMA3]](s32), [[C4]] + ; GCN-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[FCMP5]](s1), [[ADD3]], [[SELECT6]] + ; GCN-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[SELECT7]], [[C5]] + ; GCN-NEXT: [[SELECT8:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[FMUL3]], [[SELECT7]] + ; GCN-NEXT: [[IS_FPCLASS1:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[SELECT5]](s32), 608 + ; GCN-NEXT: [[SELECT9:%[0-9]+]]:_(s32) = G_SELECT [[IS_FPCLASS1]](s1), [[SELECT5]], [[SELECT8]] + ; GCN-NEXT: [[FCMP6:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[C]](s32), [[UV2]] + ; GCN-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[C1]] + ; GCN-NEXT: [[SELECT10:%[0-9]+]]:_(s32) = G_SELECT [[FCMP6]](s1), [[FMUL4]], [[UV2]] + ; GCN-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[SELECT10]](s32) + ; GCN-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[INT2]], [[C2]] + ; GCN-NEXT: [[FNEG4:%[0-9]+]]:_(s32) = G_FNEG [[ADD4]] + ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FNEG4]], [[INT2]], [[SELECT10]] + ; GCN-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[INT2]], [[C3]] + ; GCN-NEXT: [[FNEG5:%[0-9]+]]:_(s32) = G_FNEG [[ADD5]] + ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG5]], [[INT2]], [[SELECT10]] + ; GCN-NEXT: [[FCMP7:%[0-9]+]]:_(s1) = G_FCMP floatpred(ole), [[FMA4]](s32), [[C4]] + ; GCN-NEXT: [[SELECT11:%[0-9]+]]:_(s32) = G_SELECT [[FCMP7]](s1), [[ADD4]], [[INT2]] + ; GCN-NEXT: [[FCMP8:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[FMA5]](s32), [[C4]] + ; GCN-NEXT: [[SELECT12:%[0-9]+]]:_(s32) = G_SELECT 
[[FCMP8]](s1), [[ADD5]], [[SELECT11]] + ; GCN-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[SELECT12]], [[C5]] + ; GCN-NEXT: [[SELECT13:%[0-9]+]]:_(s32) = G_SELECT [[FCMP6]](s1), [[FMUL5]], [[SELECT12]] + ; GCN-NEXT: [[IS_FPCLASS2:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[SELECT10]](s32), 608 + ; GCN-NEXT: [[SELECT14:%[0-9]+]]:_(s32) = G_SELECT [[IS_FPCLASS2]](s1), [[SELECT10]], [[SELECT13]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SELECT4]](s32), [[SELECT9]](s32), [[SELECT14]](s32) ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = G_FSQRT %0 @@ -264,17 +386,18 @@ ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT]] - ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT]](s32) + ; SI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT]](s32) + ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT1]] - ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT1]](s32) + ; SI-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT1]](s32) + ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; SI-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) + ; ; VI-LABEL: name: test_fsqrt_v2s16 ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} @@ -292,6 +415,7 @@ ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; VI-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; VI-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) + ; ; GFX9-LABEL: name: test_fsqrt_v2s16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -326,19 +450,20 @@ ; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT]] - ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT]](s32) + ; SI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT]](s32) + ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT1]] - ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT1]](s32) + ; SI-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT1]](s32) + ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; SI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT2]] - ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT2]](s32) + ; SI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT2]](s32) + ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT2]](s32) ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT 
[[FPTRUNC1]](s16) ; SI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC2]](s16) ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) ; SI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; ; VI-LABEL: name: test_fsqrt_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) @@ -357,6 +482,7 @@ ; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT2]](s16) ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) ; VI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; ; GFX9-LABEL: name: test_fsqrt_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) @@ -402,17 +528,17 @@ ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT]] - ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT]](s32) + ; SI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT]](s32) + ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT1]] - ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT1]](s32) + ; SI-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT1]](s32) + ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; SI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT2]] - ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT2]](s32) + ; SI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT2]](s32) + ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT2]](s32) ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; SI-NEXT: [[FSQRT3:%[0-9]+]]:_(s32) = G_FSQRT [[FPEXT3]] - ; SI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FSQRT3]](s32) + ; SI-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[FPEXT3]](s32) + ; SI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT3]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) @@ -425,6 +551,7 @@ ; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; ; VI-LABEL: name: test_fsqrt_v4s16 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} @@ -455,6 +582,7 @@ ; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; ; GFX9-LABEL: name: test_fsqrt_v4s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -1579,7 +1579,12 @@ ; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) ; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD1]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[SQRT_MISMATCH_MD2:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 32, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP37]]) +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP38]]) +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = select i1 [[TMP36]], i32 -16, i32 0 +; IEEE-GOODFREXP-NEXT: [[SQRT_MISMATCH_MD2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP39]], i32 [[TMP40]]) ; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD2:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_MISMATCH_MD2]] ; IEEE-GOODFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD2]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: ret void @@ -1655,7 +1660,12 @@ ; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) ; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) ; IEEE-BADFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD1]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[SQRT_MISMATCH_MD2:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 32, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP37]]) +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP38]]) +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = select i1 [[TMP36]], i32 -16, i32 0 +; IEEE-BADFREXP-NEXT: [[SQRT_MISMATCH_MD2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP39]], i32 [[TMP40]]) ; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD2:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_MISMATCH_MD2]] ; IEEE-BADFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD2]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: ret void @@ -1667,7 +1677,7 @@ ; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_1ULP:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_MD_1ULP_MULTI_USE:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; DAZ-NEXT: [[SQRT_MD_1ULP_MULTI_USE:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[SQRT_MD_1ULP_MULTI_USE]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_1ULP_MULTI_USE:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_MD_1ULP_MULTI_USE]]) ; DAZ-NEXT: store volatile float [[MD_1ULP_MULTI_USE]], ptr addrspace(1) [[OUT]], align 4 @@ -1697,7 +1707,7 @@ ; DAZ-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]) ; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float 
@llvm.amdgcn.rcp.f32(float [[SQRT_MISMATCH_MD1]]) ; DAZ-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD1]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_MISMATCH_MD2:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 +; DAZ-NEXT: [[SQRT_MISMATCH_MD2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD2:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_MISMATCH_MD2]] ; DAZ-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD2]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void @@ -1885,7 +1895,7 @@ ; ; DAZ-LABEL: define float @rsq_f32_missing_contract0 ; DAZ-SAME: (float [[X:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP]]) ; DAZ-NEXT: ret float [[FDIV_OPENCL]] ; @@ -1919,7 +1929,7 @@ ; ; DAZ-LABEL: define float @rsq_f32_missing_contract1 ; DAZ-SAME: (float [[X:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP]]) ; DAZ-NEXT: ret float [[FDIV_OPENCL]] ; @@ -2126,49 +2136,51 @@ ; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 ; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[X]], i64 1 ; IEEE-GOODFREXP-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = fcmp contract olt float [[TMP20]], 0x3810000000000000 -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = select contract i1 [[TMP22]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP20]], [[TMP23]] -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP24]]) -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = select contract i1 [[TMP22]], float 4.096000e+03, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = fmul contract float [[TMP25]], [[TMP26]] -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP19]]) -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]]) -; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) -; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 -; 
IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]] -; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]] -; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]]) -; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 -; IEEE-GOODFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fcmp contract olt float [[TMP22]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP24]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = fmul contract float [[TMP22]], [[TMP25]] +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = select contract i1 [[TMP24]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = fmul contract float [[TMP27]], [[TMP28]] +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP30]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP30]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP34]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = fmul contract float [[TMP35]], [[TMP33]] +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = sub i32 [[TMP36]], [[TMP32]] +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP37]], i32 [[TMP38]]) +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = insertelement <2 x float> poison, float [[TMP29]], i64 0 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP40]], float [[TMP39]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 -; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = fcmp contract olt float [[TMP41]], 0x3810000000000000 -; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = select contract i1 [[TMP43]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = fmul contract float [[TMP41]], [[TMP44]] -; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP45]]) -; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = select contract i1 [[TMP43]], float 4.096000e+03, float 1.000000e+00 -; 
IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = fmul contract float [[TMP46]], [[TMP47]] -; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = fcmp contract olt float [[TMP42]], 0x3810000000000000 -; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = select contract i1 [[TMP49]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = fmul contract float [[TMP42]], [[TMP50]] -; IEEE-GOODFREXP-NEXT: [[TMP52:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP51]]) -; IEEE-GOODFREXP-NEXT: [[TMP53:%.*]] = select contract i1 [[TMP49]], float 4.096000e+03, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP54:%.*]] = fmul contract float [[TMP52]], [[TMP53]] -; IEEE-GOODFREXP-NEXT: [[TMP55:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0 -; IEEE-GOODFREXP-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP55]], float [[TMP54]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = fcmp contract olt float [[TMP43]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = select contract i1 [[TMP45]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = fmul contract float [[TMP43]], [[TMP46]] +; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP47]]) +; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = select contract i1 [[TMP45]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = fmul contract float [[TMP48]], [[TMP49]] +; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = fcmp contract olt float [[TMP44]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP52:%.*]] = select contract i1 [[TMP51]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP53:%.*]] = fmul contract float [[TMP44]], [[TMP52]] +; IEEE-GOODFREXP-NEXT: [[TMP54:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP53]]) +; IEEE-GOODFREXP-NEXT: [[TMP55:%.*]] = select contract i1 [[TMP51]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP56:%.*]] = fmul contract float [[TMP54]], [[TMP55]] +; IEEE-GOODFREXP-NEXT: [[TMP57:%.*]] = insertelement <2 x float> poison, float [[TMP50]], i64 0 +; IEEE-GOODFREXP-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP57]], float [[TMP56]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: ret void ; @@ -2197,49 +2209,51 @@ ; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 ; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[X]], i64 1 ; IEEE-BADFREXP-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> 
[[X]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = fcmp contract olt float [[TMP20]], 0x3810000000000000 -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = select contract i1 [[TMP22]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP20]], [[TMP23]] -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP24]]) -; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = select contract i1 [[TMP22]], float 4.096000e+03, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = fmul contract float [[TMP25]], [[TMP26]] -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP19]]) -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP19]]) -; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]]) -; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) -; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 -; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) -; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]] -; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]] -; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]]) -; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 -; IEEE-BADFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fcmp contract olt float [[TMP22]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP24]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = fmul contract float [[TMP22]], [[TMP25]] +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = select contract i1 [[TMP24]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = fmul contract float [[TMP27]], [[TMP28]] +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP30]], 0 +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = fmul contract float [[TMP35]], [[TMP33]] +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = sub i32 [[TMP36]], [[TMP32]] +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call contract float @llvm.ldexp.f32.i32(float 
[[TMP37]], i32 [[TMP38]]) +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = insertelement <2 x float> poison, float [[TMP29]], i64 0 +; IEEE-BADFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP40]], float [[TMP39]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 -; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = fcmp contract olt float [[TMP41]], 0x3810000000000000 -; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = select contract i1 [[TMP43]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = fmul contract float [[TMP41]], [[TMP44]] -; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP45]]) -; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = select contract i1 [[TMP43]], float 4.096000e+03, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = fmul contract float [[TMP46]], [[TMP47]] -; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = fcmp contract olt float [[TMP42]], 0x3810000000000000 -; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = select contract i1 [[TMP49]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP51:%.*]] = fmul contract float [[TMP42]], [[TMP50]] -; IEEE-BADFREXP-NEXT: [[TMP52:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP51]]) -; IEEE-BADFREXP-NEXT: [[TMP53:%.*]] = select contract i1 [[TMP49]], float 4.096000e+03, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP54:%.*]] = fmul contract float [[TMP52]], [[TMP53]] -; IEEE-BADFREXP-NEXT: [[TMP55:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0 -; IEEE-BADFREXP-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP55]], float [[TMP54]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = fcmp contract olt float [[TMP43]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = select contract i1 [[TMP45]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = fmul contract float [[TMP43]], [[TMP46]] +; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP47]]) +; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = select contract i1 [[TMP45]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = fmul contract float [[TMP48]], [[TMP49]] +; IEEE-BADFREXP-NEXT: [[TMP51:%.*]] = fcmp contract olt float [[TMP44]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP52:%.*]] = select contract i1 [[TMP51]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP53:%.*]] = fmul contract float [[TMP44]], [[TMP52]] +; IEEE-BADFREXP-NEXT: [[TMP54:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP53]]) +; IEEE-BADFREXP-NEXT: [[TMP55:%.*]] = select contract i1 [[TMP51]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: 
[[TMP56:%.*]] = fmul contract float [[TMP54]], [[TMP55]] +; IEEE-BADFREXP-NEXT: [[TMP57:%.*]] = insertelement <2 x float> poison, float [[TMP50]], i64 0 +; IEEE-BADFREXP-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP57]], float [[TMP56]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: ret void ; @@ -2258,34 +2272,37 @@ ; DAZ-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i64 0 ; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP6]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 -; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP10]]) -; DAZ-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP9]]) -; DAZ-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 -; DAZ-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 -; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; DAZ-NEXT: [[TMP17:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) -; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP17]], 0 -; DAZ-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP17]], 1 -; DAZ-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP16]] -; DAZ-NEXT: [[TMP21:%.*]] = sub i32 [[TMP19]], [[TMP15]] -; DAZ-NEXT: [[TMP22:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP20]], i32 [[TMP21]]) -; DAZ-NEXT: [[TMP23:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 -; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP22]], i64 1 +; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP8]]) +; DAZ-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP9]]) +; DAZ-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 +; DAZ-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP11]], i64 1 +; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 +; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 +; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]]) +; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 +; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1 +; DAZ-NEXT: [[TMP19:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; DAZ-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0 +; DAZ-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1 +; DAZ-NEXT: [[TMP23:%.*]] = fmul contract float [[TMP21]], [[TMP19]] +; DAZ-NEXT: [[TMP24:%.*]] = sub i32 [[TMP22]], [[TMP18]] +; DAZ-NEXT: 
[[TMP25:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP23]], i32 [[TMP24]]) +; DAZ-NEXT: [[TMP26:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i64 0 +; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP26]], float [[TMP25]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 -; DAZ-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 -; DAZ-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 -; DAZ-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP28:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP26]]) -; DAZ-NEXT: [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP27]]) -; DAZ-NEXT: [[TMP30:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 -; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i64 1 +; DAZ-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 +; DAZ-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 +; DAZ-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP31:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP29]]) +; DAZ-NEXT: [[TMP32:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP30]]) +; DAZ-NEXT: [[TMP33:%.*]] = insertelement <2 x float> poison, float [[TMP31]], i64 0 +; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP33]], float [[TMP32]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -2743,49 +2760,59 @@ define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp(ptr addrspace(1) %out, float %x, float %y, float %sqr.denom) { ; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp ; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP6]] -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp 
contract float [[Y]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = fcmp olt float [[SQR_DENOM]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call float @llvm.ldexp.f32.i32(float [[SQR_DENOM]], i32 [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i32 -16, i32 0 +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP6]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP11]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP17]] ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: ret void ; ; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp ; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 -; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP6]] -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float 
[[Y]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = fcmp olt float [[SQR_DENOM]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call float @llvm.ldexp.f32.i32(float [[SQR_DENOM]], i32 [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i32 -16, i32 0 +; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP11]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP17]] ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 +; DAZ-NEXT: [[DENOM:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[SQR_DENOM]]) ; DAZ-NEXT: [[TMP1:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[DENOM]]) ; DAZ-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP1]] ; DAZ-NEXT: [[TMP2:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[DENOM]]) @@ -2805,121 +2832,156 @@ define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_vector_25ulp(ptr addrspace(1) %out, <2 x float> %x, <2 x float> %y, <2 x float> %sqr.denom) { ; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_vector_25ulp ; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } 
@llvm.frexp.f32.i32(float [[TMP3]]) -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul arcp contract float [[TMP1]], [[TMP10]] -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fmul arcp contract float [[TMP2]], [[TMP17]] -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 -; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP22]]) -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP24]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP24]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = sub i32 0, [[TMP26]] -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP25]]) -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP27]]) -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = fmul arcp contract float [[TMP20]], [[TMP29]] -; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP23]]) -; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = sub i32 0, [[TMP33]] -; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) -; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) -; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = fmul arcp contract float [[TMP21]], [[TMP36]] -; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP30]], i64 0 -; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQR_DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQR_DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = fcmp olt float [[TMP1]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 32, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float 
@llvm.ldexp.f32.i32(float [[TMP1]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], i32 -16, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fcmp olt float [[TMP2]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 32, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP2]], i32 [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = select i1 [[TMP9]], i32 -16, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i64 0 +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = sub i32 0, [[TMP22]] +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP24]], i32 [[TMP23]]) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = fmul arcp contract float [[TMP16]], [[TMP25]] +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP19]]) +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP27]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP27]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = sub i32 0, [[TMP29]] +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP28]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP31]], i32 [[TMP30]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = fmul arcp contract float [[TMP17]], [[TMP32]] +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i64 0 +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP34]], float [[TMP33]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP37]]) +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP39]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = sub i32 0, [[TMP41]] +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float 
[[TMP40]]) +; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP43]], i32 [[TMP42]]) +; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = fmul arcp contract float [[TMP35]], [[TMP44]] +; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP38]]) +; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = extractvalue { float, i32 } [[TMP46]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = extractvalue { float, i32 } [[TMP46]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = sub i32 0, [[TMP48]] +; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP47]]) +; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP50]], i32 [[TMP49]]) +; IEEE-GOODFREXP-NEXT: [[TMP52:%.*]] = fmul arcp contract float [[TMP36]], [[TMP51]] +; IEEE-GOODFREXP-NEXT: [[TMP53:%.*]] = insertelement <2 x float> poison, float [[TMP45]], i64 0 +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP53]], float [[TMP52]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-GOODFREXP-NEXT: ret void ; ; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_vector_25ulp ; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 -; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul arcp contract float [[TMP1]], [[TMP10]] -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fmul arcp contract float [[TMP2]], [[TMP17]] -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 -; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; 
IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP22]]) -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP24]], 0 -; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP22]]) -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = sub i32 0, [[TMP26]] -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP25]]) -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP27]]) -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = fmul arcp contract float [[TMP20]], [[TMP29]] -; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP23]]) -; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 -; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP23]]) -; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = sub i32 0, [[TMP33]] -; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) -; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) -; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = fmul arcp contract float [[TMP21]], [[TMP36]] -; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP30]], i64 0 -; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQR_DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQR_DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = fcmp olt float [[TMP1]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 32, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP1]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], i32 -16, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fcmp olt float [[TMP2]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 32, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP2]], i32 [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = select i1 [[TMP9]], i32 -16, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i64 0 +; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0 +; IEEE-BADFREXP-NEXT: 
[[TMP22:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = sub i32 0, [[TMP22]] +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP24]], i32 [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = fmul arcp contract float [[TMP16]], [[TMP25]] +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP27]], 0 +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = sub i32 0, [[TMP29]] +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP28]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP31]], i32 [[TMP30]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = fmul arcp contract float [[TMP17]], [[TMP32]] +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i64 0 +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP34]], float [[TMP33]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP37]]) +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP37]]) +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = sub i32 0, [[TMP41]] +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP40]]) +; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP43]], i32 [[TMP42]]) +; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = fmul arcp contract float [[TMP35]], [[TMP44]] +; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP38]]) +; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = extractvalue { float, i32 } [[TMP46]], 0 +; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP38]]) +; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = sub i32 0, [[TMP48]] +; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP47]]) +; IEEE-BADFREXP-NEXT: [[TMP51:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP50]], i32 [[TMP49]]) +; IEEE-BADFREXP-NEXT: [[TMP52:%.*]] = fmul arcp contract float [[TMP36]], [[TMP51]] +; IEEE-BADFREXP-NEXT: [[TMP53:%.*]] = insertelement <2 x float> poison, float [[TMP45]], i64 0 +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP53]], float [[TMP52]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_vector_25ulp ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[DENOM:%.*]] = call contract <2 x float> 
@llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP3]]) -; DAZ-NEXT: [[TMP6:%.*]] = fmul arcp contract float [[TMP1]], [[TMP5]] -; DAZ-NEXT: [[TMP7:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP4]]) -; DAZ-NEXT: [[TMP8:%.*]] = fmul arcp contract float [[TMP2]], [[TMP7]] -; DAZ-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0 -; DAZ-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP8]], i64 1 -; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; DAZ-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP14:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; DAZ-NEXT: [[TMP15:%.*]] = fmul arcp contract float [[TMP10]], [[TMP14]] -; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; DAZ-NEXT: [[TMP17:%.*]] = fmul arcp contract float [[TMP11]], [[TMP16]] -; DAZ-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i64 0 -; DAZ-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP17]], i64 1 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQR_DENOM]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQR_DENOM]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 +; DAZ-NEXT: [[DENOM:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i64 1 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; DAZ-NEXT: [[TMP11:%.*]] = fmul arcp contract float [[TMP6]], [[TMP10]] +; DAZ-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; DAZ-NEXT: [[TMP13:%.*]] = fmul arcp contract float [[TMP7]], [[TMP12]] +; DAZ-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; DAZ-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i64 1 +; DAZ-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; DAZ-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP19:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP20:%.*]] = fmul arcp contract float [[TMP15]], [[TMP19]] +; DAZ-NEXT: [[TMP21:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP18]]) +; DAZ-NEXT: [[TMP22:%.*]] = fmul arcp contract float [[TMP16]], [[TMP21]] +; DAZ-NEXT: [[TMP23:%.*]] = insertelement <2 x 
float> poison, float [[TMP20]], i64 0 +; DAZ-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP22]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: ret void @@ -2935,28 +2997,33 @@ define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp_x3(ptr addrspace(1) %out, float %x, float %y, float %z, float %sqr.denom) { ; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp_x3 ; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP6]] -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP12]] -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) -; IEEE-GOODFREXP-NEXT: [[ARCP2:%.*]] = fmul arcp contract float [[Z]], [[TMP18]] +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = fcmp olt float [[SQR_DENOM]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call float @llvm.ldexp.f32.i32(float [[SQR_DENOM]], i32 [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i32 -16, i32 0 +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP6]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call 
arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP11]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP17]] +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[ARCP2:%.*]] = fmul arcp contract float [[Z]], [[TMP23]] ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 @@ -2964,28 +3031,33 @@ ; ; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp_x3 ; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 -; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP6]] -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP12]] -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 -; 
IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) -; IEEE-BADFREXP-NEXT: [[ARCP2:%.*]] = fmul arcp contract float [[Z]], [[TMP18]] +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = fcmp olt float [[SQR_DENOM]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call float @llvm.ldexp.f32.i32(float [[SQR_DENOM]], i32 [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i32 -16, i32 0 +; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP11]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP17]] +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[ARCP2:%.*]] = fmul arcp contract float [[Z]], [[TMP23]] ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 @@ -2993,7 +3065,7 @@ ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp_x3 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 +; DAZ-NEXT: [[DENOM:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[SQR_DENOM]]) ; DAZ-NEXT: 
[[TMP1:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[DENOM]]) ; DAZ-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP1]] ; DAZ-NEXT: [[TMP2:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[DENOM]]) @@ -3018,140 +3090,155 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) { ; IEEE-GOODFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator ; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 ; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP10]] -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = fmul contract float [[TMP12]], [[TMP13]] -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP6]], [[TMP16]] -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP19]] -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP21]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP21]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP22]]) -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-GOODFREXP-NEXT: 
[[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = fcmp contract olt float [[TMP9]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP13]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP9]], [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = select contract i1 [[TMP13]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fmul contract float [[TMP16]], [[TMP17]] +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fcmp contract olt float [[TMP10]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = select contract i1 [[TMP19]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = fmul contract float [[TMP10]], [[TMP20]] +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = select contract i1 [[TMP19]], float -4.096000e+03, float -1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP22]], [[TMP23]] +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP7]]) ; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 ; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[TMP26]], [[TMP24]] -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP23]] -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP29]]) -; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) -; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = fmul contract float [[TMP30]], [[TMP28]] +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP27]] +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP33]]) +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) ; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0 ; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = extractvalue { float, i32 } [[TMP35]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]] -; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = sub i32 [[TMP37]], [[TMP33]] -; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP38]], i32 
[[TMP39]]) -; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP20]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP30]], i64 2 -; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP43]], float [[TMP40]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP36]]) +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP39]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = fmul contract float [[TMP40]], [[TMP38]] +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = sub i32 [[TMP41]], [[TMP37]] +; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP42]], i32 [[TMP43]]) +; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP18]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP24]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP34]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i64 3 ; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator ; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 ; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000 -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP10]] -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = fmul contract float [[TMP12]], [[TMP13]] -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul contract float 
[[TMP6]], [[TMP16]] -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP19]] -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP21]], 0 -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP22]]) -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = fcmp contract olt float [[TMP9]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP13]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP9]], [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = select contract i1 [[TMP13]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fmul contract float [[TMP16]], [[TMP17]] +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fcmp contract olt float [[TMP10]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = select contract i1 [[TMP19]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = fmul contract float [[TMP10]], [[TMP20]] +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = select contract i1 [[TMP19]], float -4.096000e+03, float -1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP22]], [[TMP23]] +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP7]]) ; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 4.000000e+00) -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[TMP26]], [[TMP24]] -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP23]] -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP29]]) -; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 -; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) -; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 
undef) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0 +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = fmul contract float [[TMP30]], [[TMP28]] +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP27]] +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP33]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) ; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0 -; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) -; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]] -; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = sub i32 [[TMP37]], [[TMP33]] -; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP39]]) -; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP20]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP30]], i64 2 -; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP43]], float [[TMP40]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP36]]) +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = fmul contract float [[TMP40]], [[TMP38]] +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = sub i32 [[TMP41]], [[TMP37]] +; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP42]], i32 [[TMP43]]) +; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP18]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP24]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP34]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i64 3 ; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; DAZ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; DAZ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 
2 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; DAZ-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP5]]) -; DAZ-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP6]]) -; DAZ-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP10]] -; DAZ-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) -; DAZ-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; DAZ-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 -; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; DAZ-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) -; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 -; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1 -; DAZ-NEXT: [[TMP19:%.*]] = fmul contract float [[TMP17]], [[TMP15]] -; DAZ-NEXT: [[TMP20:%.*]] = sub i32 [[TMP18]], [[TMP14]] -; DAZ-NEXT: [[TMP21:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP19]], i32 [[TMP20]]) -; DAZ-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; DAZ-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 -; DAZ-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 -; DAZ-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP23]]) -; DAZ-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) -; DAZ-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0 -; DAZ-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1 -; DAZ-NEXT: [[TMP29:%.*]] = fmul contract float [[TMP27]], [[TMP25]] -; DAZ-NEXT: [[TMP30:%.*]] = sub i32 [[TMP28]], [[TMP24]] -; DAZ-NEXT: [[TMP31:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP30]]) -; DAZ-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 -; DAZ-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP11]], i64 1 -; DAZ-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP21]], i64 2 -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i64 3 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1 +; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2 +; DAZ-NEXT: [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3 +; DAZ-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; DAZ-NEXT: [[TMP17:%.*]] = fneg contract 
float [[TMP13]] +; DAZ-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]]) +; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; DAZ-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; DAZ-NEXT: [[TMP22:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; DAZ-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; DAZ-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; DAZ-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 +; DAZ-NEXT: [[TMP26:%.*]] = fmul contract float [[TMP24]], [[TMP22]] +; DAZ-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; DAZ-NEXT: [[TMP28:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) +; DAZ-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP15]]) +; DAZ-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0 +; DAZ-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1 +; DAZ-NEXT: [[TMP32:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP30]]) +; DAZ-NEXT: [[TMP33:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; DAZ-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP33]], 0 +; DAZ-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP33]], 1 +; DAZ-NEXT: [[TMP36:%.*]] = fmul contract float [[TMP34]], [[TMP32]] +; DAZ-NEXT: [[TMP37:%.*]] = sub i32 [[TMP35]], [[TMP31]] +; DAZ-NEXT: [[TMP38:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP37]]) +; DAZ-NEXT: [[TMP39:%.*]] = insertelement <4 x float> poison, float [[TMP16]], i64 0 +; DAZ-NEXT: [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP18]], i64 1 +; DAZ-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP28]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 @@ -3286,11 +3373,32 @@ } define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(<4 x float> %arg) { -; CHECK-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div -; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> , [[DENOM]] -; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div +; IEEE-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 +; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> , [[DENOM]] +; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div +; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: 
[[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1 +; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2 +; DAZ-NEXT: [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> , [[DENOM]] +; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 %partial.rsq = fdiv contract afn <4 x float> , %denom @@ -3298,11 +3406,32 @@ } define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv(<4 x float> %arg) { -; CHECK-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv -; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> , [[DENOM]] -; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv +; IEEE-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 +; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> , [[DENOM]] +; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv +; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1 +; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2 +; DAZ-NEXT: [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> , [[DENOM]] +; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 %partial.rsq = fdiv contract <4 x float> , %denom @@ -3446,112 +3575,127 @@ define <4 x 
float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %arg) { ; IEEE-GOODFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp ; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 ; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fcmp arcp contract olt float [[TMP5]], 0x3810000000000000 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = select arcp contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul arcp contract float [[TMP5]], [[TMP10]] -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = select arcp contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = fmul arcp contract float [[TMP12]], [[TMP13]] -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fcmp arcp contract olt float [[TMP6]], 0x3810000000000000 -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = select arcp contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul arcp contract float [[TMP6]], [[TMP16]] -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = select arcp contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = fmul arcp contract float [[TMP18]], [[TMP19]] -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP21]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP21]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = sub i32 0, [[TMP23]] -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP22]]) -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP24]]) -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP26]] -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] -; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float 
@llvm.amdgcn.rcp.f32(float [[TMP29]]) -; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) -; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = fmul arcp contract float undef, [[TMP33]] -; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP20]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP27]], i64 2 -; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = fcmp arcp contract olt float [[TMP9]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = select arcp contract i1 [[TMP13]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fmul arcp contract float [[TMP9]], [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = select arcp contract i1 [[TMP13]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fmul arcp contract float [[TMP16]], [[TMP17]] +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fcmp arcp contract olt float [[TMP10]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = select arcp contract i1 [[TMP19]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = fmul arcp contract float [[TMP10]], [[TMP20]] +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = select arcp contract i1 [[TMP19]], float -4.096000e+03, float -1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fmul arcp contract float [[TMP22]], [[TMP23]] +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]]) +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float 
[[TMP33]]) +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = fmul arcp contract float undef, [[TMP37]] +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = insertelement <4 x float> poison, float [[TMP18]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP24]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP31]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i64 3 ; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp ; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 ; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fcmp arcp contract olt float [[TMP5]], 0x3810000000000000 -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = select arcp contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul arcp contract float [[TMP5]], [[TMP10]] -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = select arcp contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = fmul arcp contract float [[TMP12]], [[TMP13]] -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fcmp arcp contract olt float [[TMP6]], 0x3810000000000000 -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = select arcp contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul arcp contract float [[TMP6]], [[TMP16]] -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = select arcp contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = fmul arcp contract float [[TMP18]], [[TMP19]] -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP21]], 0 -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = sub i32 0, [[TMP23]] -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP22]]) -; IEEE-BADFREXP-NEXT: 
[[TMP26:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP24]]) -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP26]] -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] -; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP29]]) -; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) -; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = fmul arcp contract float undef, [[TMP33]] -; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP20]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP27]], i64 2 -; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = fcmp arcp contract olt float [[TMP9]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = select arcp contract i1 [[TMP13]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fmul arcp contract float [[TMP9]], [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = select arcp contract i1 [[TMP13]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fmul arcp contract float [[TMP16]], [[TMP17]] +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fcmp arcp contract olt float [[TMP10]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = select arcp contract i1 [[TMP19]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = fmul arcp contract float [[TMP10]], [[TMP20]] +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = select arcp contract i1 [[TMP19]], float -4.096000e+03, float -1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fmul arcp contract float [[TMP22]], [[TMP23]] +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call arcp contract 
float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP30]] +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP33]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = fmul arcp contract float undef, [[TMP37]] +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = insertelement <4 x float> poison, float [[TMP18]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP24]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP31]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i64 3 ; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; DAZ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; DAZ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; DAZ-NEXT: [[TMP9:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP5]]) -; DAZ-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP6]]) -; DAZ-NEXT: [[TMP11:%.*]] = fneg arcp contract float [[TMP10]] -; DAZ-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP3]]) -; DAZ-NEXT: [[TMP13:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP12]] -; DAZ-NEXT: [[TMP14:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP4]]) -; DAZ-NEXT: [[TMP15:%.*]] = fmul arcp contract float undef, [[TMP14]] -; DAZ-NEXT: [[TMP16:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 -; DAZ-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP11]], i64 1 -; DAZ-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP13]], i64 2 -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP15]], i64 3 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP4]]) 
+; DAZ-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1 +; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2 +; DAZ-NEXT: [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3 +; DAZ-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; DAZ-NEXT: [[TMP17:%.*]] = fneg arcp contract float [[TMP13]] +; DAZ-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP19:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; DAZ-NEXT: [[TMP20:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP19]] +; DAZ-NEXT: [[TMP21:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP15]]) +; DAZ-NEXT: [[TMP22:%.*]] = fmul arcp contract float undef, [[TMP21]] +; DAZ-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP16]], i64 0 +; DAZ-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP18]], i64 1 +; DAZ-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP20]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP22]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 @@ -3560,11 +3704,32 @@ } define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct(<4 x float> %arg) { -; CHECK-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct -; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> , [[DENOM]] -; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct +; IEEE-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 +; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> , [[DENOM]] +; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct +; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP8:%.*]] = call float 
@llvm.amdgcn.sqrt.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1 +; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2 +; DAZ-NEXT: [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> , [[DENOM]] +; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 %partial.rsq = fdiv contract arcp <4 x float> , %denom @@ -3687,110 +3852,151 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float> %x) { ; IEEE-GOODFREXP-LABEL: define <4 x float> @rsq_f32_vector_const_denom ; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; IEEE-GOODFREXP-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP6]] -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP9]]) -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 4.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 2.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 8.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i64 2 +; IEEE-GOODFREXP-NEXT: [[SQRT:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP4]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) ; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 ; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP13]], [[TMP11]] -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP10]] 
-; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP16]]) -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]] -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP20]] -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP26]]) -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP7]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP17]], i64 2 -; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]] +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP23]], i32 [[TMP22]]) +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = fmul contract float [[TMP30]], [[TMP28]] +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP27]] +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP33]]) +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = extractvalue { float, i32 } [[TMP35]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP36]]) +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call { float, i32 } 
@llvm.frexp.f32.i32(float 2.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP39]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = fmul contract float [[TMP40]], [[TMP38]] +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = sub i32 [[TMP41]], [[TMP37]] +; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP42]], i32 [[TMP43]]) +; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP24]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP34]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i64 3 ; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_const_denom ; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; IEEE-BADFREXP-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 -; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP6]] -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP9]]) -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 8.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i64 2 +; IEEE-BADFREXP-NEXT: [[SQRT:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP4]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) ; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 
@llvm.amdgcn.frexp.exp.i32.f32(float undef) -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP13]], [[TMP11]] -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP10]] -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP16]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 2.000000e+00) -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]] -; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP20]] -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP26]]) -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP7]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP17]], i64 2 -; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]] +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP23]], i32 [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0 +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = fmul contract float [[TMP30]], [[TMP28]] +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP27]] +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP33]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 
[[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0 +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP36]]) +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = fmul contract float [[TMP40]], [[TMP38]] +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = sub i32 [[TMP41]], [[TMP37]] +; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP42]], i32 [[TMP43]]) +; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP24]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP34]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i64 3 ; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_const_denom ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 -; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) -; DAZ-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) -; DAZ-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP6]] -; DAZ-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) -; DAZ-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 -; DAZ-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 -; DAZ-NEXT: [[TMP11:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP9]]) -; DAZ-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) -; DAZ-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; DAZ-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 -; DAZ-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP13]], [[TMP11]] -; DAZ-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP10]] -; DAZ-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP16]]) -; DAZ-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) -; DAZ-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 -; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 -; DAZ-NEXT: [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) -; DAZ-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) -; DAZ-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 -; DAZ-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 -; DAZ-NEXT: [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]] -; DAZ-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP20]] -; DAZ-NEXT: [[TMP27:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 
[[TMP26]]) -; DAZ-NEXT: [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; DAZ-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP7]], i64 1 -; DAZ-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP17]], i64 2 -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i64 3 +; DAZ-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 4.000000e+00) +; DAZ-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 2.000000e+00) +; DAZ-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 8.000000e+00) +; DAZ-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float undef) +; DAZ-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; DAZ-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i64 1 +; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i64 2 +; DAZ-NEXT: [[SQRT:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP4]], i64 3 +; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 +; DAZ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 +; DAZ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 +; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; DAZ-NEXT: [[TMP13:%.*]] = fneg contract float [[TMP9]] +; DAZ-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP15:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) +; DAZ-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0 +; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP15]], 1 +; DAZ-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP16]]) +; DAZ-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; DAZ-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; DAZ-NEXT: [[TMP22:%.*]] = fmul contract float [[TMP20]], [[TMP18]] +; DAZ-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[TMP17]] +; DAZ-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP23]]) +; DAZ-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) +; DAZ-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; DAZ-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1 +; DAZ-NEXT: [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; DAZ-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) +; DAZ-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0 +; DAZ-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1 +; DAZ-NEXT: [[TMP32:%.*]] = fmul contract float [[TMP30]], [[TMP28]] +; DAZ-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP27]] +; DAZ-NEXT: [[TMP34:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP33]]) +; DAZ-NEXT: [[TMP35:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0 +; DAZ-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP14]], i64 1 +; DAZ-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP24]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %sqrt = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), 
!fpmath !2 Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll @@ -16,21 +16,52 @@ } define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) { -; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32 -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) -; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 -; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4 -; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret void +; IEEE-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32 +; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] { +; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0 +; IEEE-NEXT: [[TMP3:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP2]]) +; IEEE-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP5]]) +; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP6:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 32, i32 0 +; IEEE-NEXT: [[TMP8:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP7]]) +; IEEE-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP8]]) +; IEEE-NEXT: [[TMP10:%.*]] = select i1 [[TMP6]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP10]]) +; IEEE-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP11:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 32, i32 0 +; IEEE-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP12]]) +; IEEE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP13]]) +; IEEE-NEXT: 
[[TMP15:%.*]] = select i1 [[TMP11]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_2ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP15]]) +; IEEE-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32 +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] { +; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_3ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_2ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: ret void ; %no.md = call float @llvm.sqrt.f32(float %x) store volatile float %no.md, ptr addrspace(1) %out, align 4 @@ -53,21 +84,104 @@ } define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float> %x) { -; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_v2f32 -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) -; CHECK-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1 -; CHECK-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; CHECK-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_25ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 -; CHECK-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_3ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !0 -; CHECK-NEXT: store volatile <2 x float> [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_2ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !4 -; CHECK-NEXT: store volatile <2 x float> [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret void +; IEEE-LABEL: define amdgpu_kernel void @sqrt_fpmath_v2f32 +; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) +; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1 +; IEEE-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[MD_1ULP:%.*]] = call <2 x 
float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 +; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP5:%.*]] = fcmp olt float [[TMP3]], 0x3810000000000000 +; IEEE-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 32, i32 0 +; IEEE-NEXT: [[TMP7:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP3]], i32 [[TMP6]]) +; IEEE-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP7]]) +; IEEE-NEXT: [[TMP9:%.*]] = select i1 [[TMP5]], i32 -16, i32 0 +; IEEE-NEXT: [[TMP10:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) +; IEEE-NEXT: [[TMP11:%.*]] = fcmp olt float [[TMP4]], 0x3810000000000000 +; IEEE-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 32, i32 0 +; IEEE-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP12]]) +; IEEE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP13]]) +; IEEE-NEXT: [[TMP15:%.*]] = select i1 [[TMP11]], i32 -16, i32 0 +; IEEE-NEXT: [[TMP16:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP15]]) +; IEEE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 +; IEEE-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP20:%.*]] = fcmp olt float [[TMP18]], 0x3810000000000000 +; IEEE-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 32, i32 0 +; IEEE-NEXT: [[TMP22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP18]], i32 [[TMP21]]) +; IEEE-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP22]]) +; IEEE-NEXT: [[TMP24:%.*]] = select i1 [[TMP20]], i32 -16, i32 0 +; IEEE-NEXT: [[TMP25:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP23]], i32 [[TMP24]]) +; IEEE-NEXT: [[TMP26:%.*]] = fcmp olt float [[TMP19]], 0x3810000000000000 +; IEEE-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 32, i32 0 +; IEEE-NEXT: [[TMP28:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP19]], i32 [[TMP27]]) +; IEEE-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP28]]) +; IEEE-NEXT: [[TMP30:%.*]] = select i1 [[TMP26]], i32 -16, i32 0 +; IEEE-NEXT: [[TMP31:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP30]]) +; IEEE-NEXT: [[TMP32:%.*]] = insertelement <2 x float> poison, float [[TMP25]], i64 0 +; IEEE-NEXT: [[MD_3ULP:%.*]] = insertelement <2 x float> [[TMP32]], float [[TMP31]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP33:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP35:%.*]] = fcmp olt float [[TMP33]], 0x3810000000000000 +; IEEE-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 32, i32 0 +; IEEE-NEXT: [[TMP37:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP33]], i32 [[TMP36]]) +; IEEE-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP37]]) +; IEEE-NEXT: [[TMP39:%.*]] = select i1 [[TMP35]], i32 -16, i32 0 +; IEEE-NEXT: [[TMP40:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP39]]) +; IEEE-NEXT: [[TMP41:%.*]] = fcmp olt float [[TMP34]], 0x3810000000000000 +; IEEE-NEXT: [[TMP42:%.*]] = select 
i1 [[TMP41]], i32 32, i32 0 +; IEEE-NEXT: [[TMP43:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP34]], i32 [[TMP42]]) +; IEEE-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP43]]) +; IEEE-NEXT: [[TMP45:%.*]] = select i1 [[TMP41]], i32 -16, i32 0 +; IEEE-NEXT: [[TMP46:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP44]], i32 [[TMP45]]) +; IEEE-NEXT: [[TMP47:%.*]] = insertelement <2 x float> poison, float [[TMP40]], i64 0 +; IEEE-NEXT: [[MD_2ULP:%.*]] = insertelement <2 x float> [[TMP47]], float [[TMP46]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @sqrt_fpmath_v2f32 +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) +; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1 +; DAZ-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 +; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP7]]) +; DAZ-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i64 0 +; DAZ-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP9]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP11]]) +; DAZ-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP12]]) +; DAZ-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i64 0 +; DAZ-NEXT: [[MD_3ULP:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP16]]) +; DAZ-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP20:%.*]] = insertelement <2 x float> poison, float [[TMP18]], i64 0 +; DAZ-NEXT: [[MD_2ULP:%.*]] = insertelement <2 x float> [[TMP20]], float [[TMP19]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: ret void ; %no.md = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x) store volatile <2 x float> %no.md, ptr addrspace(1) %out, align 4 @@ -91,18 +205,18 @@ define 
amdgpu_kernel void @sqrt_fpmath_f32_known_nosub(ptr addrspace(1) %out, float nofpclass(sub) %x) { ; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nosub -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[X:%.*]]) #[[ATTR1]] { +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) ; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 ; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3 +; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0 +; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4 +; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; @@ -127,21 +241,52 @@ } define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out, float nofpclass(nzero) %x) { -; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) -; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 -; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4 -; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret void +; IEEE-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero +; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] { +; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_1ULP:%.*]] = call float 
@llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0 +; IEEE-NEXT: [[TMP3:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP2]]) +; IEEE-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP5]]) +; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP6:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 32, i32 0 +; IEEE-NEXT: [[TMP8:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP7]]) +; IEEE-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP8]]) +; IEEE-NEXT: [[TMP10:%.*]] = select i1 [[TMP6]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP10]]) +; IEEE-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP11:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 32, i32 0 +; IEEE-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP12]]) +; IEEE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP13]]) +; IEEE-NEXT: [[TMP15:%.*]] = select i1 [[TMP11]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_2ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP15]]) +; IEEE-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_3ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_2ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: ret void ; %no.md = call float @llvm.sqrt.f32(float %x) store volatile float %no.md, ptr addrspace(1) %out, align 4 @@ -164,21 +309,52 @@ } define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1) %out, float nofpclass(nzero nsub) %x) { -; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) -; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float 
@llvm.sqrt.f32(float [[X]]), !fpmath !1 -; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4 -; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret void +; IEEE-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub +; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] { +; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0 +; IEEE-NEXT: [[TMP3:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP2]]) +; IEEE-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP5]]) +; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP6:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 32, i32 0 +; IEEE-NEXT: [[TMP8:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP7]]) +; IEEE-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP8]]) +; IEEE-NEXT: [[TMP10:%.*]] = select i1 [[TMP6]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP10]]) +; IEEE-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP11:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 32, i32 0 +; IEEE-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP12]]) +; IEEE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP13]]) +; IEEE-NEXT: [[TMP15:%.*]] = select i1 [[TMP11]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_2ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP15]]) +; IEEE-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; 
DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_3ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_2ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: ret void ; %no.md = call float @llvm.sqrt.f32(float %x) store volatile float %no.md, ptr addrspace(1) %out, align 4 @@ -201,21 +377,52 @@ } define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrspace(1) %out, float nofpclass(nzero nsub inf) %x) { -; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) -; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 -; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4 -; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret void +; IEEE-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf +; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] { +; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0 +; IEEE-NEXT: [[TMP3:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP2]]) +; IEEE-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP5]]) +; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP6:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 32, i32 0 
+; IEEE-NEXT: [[TMP8:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP7]]) +; IEEE-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP8]]) +; IEEE-NEXT: [[TMP10:%.*]] = select i1 [[TMP6]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP10]]) +; IEEE-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP11:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 32, i32 0 +; IEEE-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP12]]) +; IEEE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP13]]) +; IEEE-NEXT: [[TMP15:%.*]] = select i1 [[TMP11]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_2ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP15]]) +; IEEE-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_3ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_2ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: ret void ; %no.md = call float @llvm.sqrt.f32(float %x) store volatile float %no.md, ptr addrspace(1) %out, align 4 @@ -238,21 +445,52 @@ } define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub(ptr addrspace(1) %out, float nofpclass(psub) %x) { -; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(psub) [[X:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) -; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 -; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4 -; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret 
void +; IEEE-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub +; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(psub) [[X:%.*]]) #[[ATTR1]] { +; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0 +; IEEE-NEXT: [[TMP3:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP2]]) +; IEEE-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP4]], i32 [[TMP5]]) +; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP6:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 32, i32 0 +; IEEE-NEXT: [[TMP8:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP7]]) +; IEEE-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP8]]) +; IEEE-NEXT: [[TMP10:%.*]] = select i1 [[TMP6]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP10]]) +; IEEE-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: [[TMP11:%.*]] = fcmp olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 32, i32 0 +; IEEE-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[X]], i32 [[TMP12]]) +; IEEE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP13]]) +; IEEE-NEXT: [[TMP15:%.*]] = select i1 [[TMP11]], i32 -16, i32 0 +; IEEE-NEXT: [[MD_2ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP15]]) +; IEEE-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(psub) [[X:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_3ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: [[MD_2ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) +; DAZ-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: ret void ; %no.md = call float @llvm.sqrt.f32(float %x) store volatile float %no.md, ptr addrspace(1) %out, align 4 @@ 
-321,13 +559,13 @@ ; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1 ; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3 +; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0 +; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4 +; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]]) ; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: [[MD_3ULP_AFN:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0 ; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -378,6 +616,3 @@ !2 = !{float 1.000000e+00} !3 = !{float 3.000000e+00} !4 = !{float 2.000000e+00} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; DAZ: {{.*}} -; IEEE: {{.*}} Index: llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -97,7 +97,7 @@ br i1 %8, label %if1, label %endloop if1: ; preds = %Flow2 - %v3 = call float @llvm.sqrt.f32(float %v0) + %v3 = call afn float @llvm.sqrt.f32(float %v0) br label %endloop endif1: ; preds = %loop Index: llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -83,7 +83,7 @@ %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 - %v = call float @llvm.sqrt.f32(float %load) + %v = call afn float @llvm.sqrt.f32(float %load) %canonicalized = tail call float @llvm.canonicalize.f32(float %v) store float %canonicalized, ptr addrspace(1) %gep, align 4 ret void Index: llvm/test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -618,6 +618,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_sqrt_f32_e32 v2, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, 1.0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 @@ -631,7 +634,6 @@ ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; SI-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0 -; 
SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 @@ -714,6 +716,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_sqrt_f32_e32 v2, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, -1.0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: v_div_scale_f32 v5, vcc, -1.0, v2, -1.0 @@ -727,7 +732,6 @@ ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; SI-NEXT: v_div_fixup_f32 v2, v3, v2, -1.0 -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 @@ -815,6 +819,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_sqrt_f32_e32 v3, v3 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, 1.0 ; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 @@ -828,7 +835,6 @@ ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; SI-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0 -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 @@ -922,6 +928,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_sqrt_f32_e32 v2, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, 1.0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 @@ -935,7 +944,6 @@ ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; SI-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0 -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 @@ -1023,6 +1031,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_sqrt_f32_e32 v2, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, 1.0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 @@ -1036,7 +1047,6 @@ ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; SI-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0 -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 @@ -1124,6 +1134,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_sqrt_f32_e32 v2, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, -1.0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: v_div_scale_f32 v5, vcc, -1.0, v2, -1.0 @@ -1137,7 +1150,6 @@ ; 
SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; SI-NEXT: v_div_fixup_f32 v2, v3, v2, -1.0 -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 @@ -1828,6 +1840,10 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_sqrt_f32_e32 v0, v0 ; SI-NEXT: v_sqrt_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 ; SI-NEXT: v_rcp_f32_e32 v3, v2 ; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 @@ -1841,18 +1857,18 @@ ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0 ; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; SI-NEXT: v_rcp_f32_e32 v2, v3 -; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v3, v2, 1.0 -; SI-NEXT: v_fma_f32 v2, v5, v2, v2 -; SI-NEXT: v_mul_f32_e32 v5, v4, v2 -; SI-NEXT: v_fma_f32 v6, -v3, v5, v4 -; SI-NEXT: v_fma_f32 v5, v6, v2, v5 -; SI-NEXT: v_fma_f32 v3, -v3, v5, v4 +; SI-NEXT: v_fma_f32 v2, -v3, v4, 1.0 +; SI-NEXT: v_fma_f32 v2, v2, v4, v4 +; SI-NEXT: v_mul_f32_e32 v4, v5, v2 +; SI-NEXT: v_fma_f32 v6, -v3, v4, v5 +; SI-NEXT: v_fma_f32 v4, v6, v2, v4 +; SI-NEXT: v_fma_f32 v3, -v3, v4, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v4 ; SI-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2022,6 +2038,10 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_sqrt_f32_e32 v0, v0 ; SI-NEXT: v_sqrt_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 ; SI-NEXT: v_rcp_f32_e32 v3, v2 ; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 @@ -2035,18 +2055,18 @@ ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, -1.0 +; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: v_div_scale_f32 v5, vcc, -1.0, v1, -1.0 ; SI-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 -; SI-NEXT: v_rcp_f32_e32 v2, v3 -; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v3, v2, 1.0 -; SI-NEXT: v_fma_f32 v2, v5, v2, v2 -; SI-NEXT: v_mul_f32_e32 v5, v4, v2 -; SI-NEXT: v_fma_f32 v6, -v3, v5, v4 -; SI-NEXT: v_fma_f32 v5, v6, v2, v5 -; SI-NEXT: v_fma_f32 v3, -v3, v5, v4 +; SI-NEXT: v_fma_f32 v2, -v3, v4, 1.0 +; SI-NEXT: v_fma_f32 v2, v2, v4, v4 +; SI-NEXT: v_mul_f32_e32 v4, v5, v2 +; SI-NEXT: v_fma_f32 v6, -v3, v4, v5 +; SI-NEXT: v_fma_f32 v4, v6, v2, v4 +; SI-NEXT: v_fma_f32 v3, -v3, v4, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v4 ; SI-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 ; SI-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ 
llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -303,41 +303,281 @@ } define float @v_fdiv_recip_sqrt_f32(float %x) { -; IEEE-LABEL: v_fdiv_recip_sqrt_f32: -; IEEE: ; %bb.0: -; IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IEEE-NEXT: v_sqrt_f32_e32 v0, v0 -; IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; IEEE-NEXT: s_setpc_b64 s[30:31] +; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CODEGEN-IEEE-SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v2, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v2, v3, v2, v2 +; CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3 +; CODEGEN-IEEE-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; CODEGEN-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; DAZ-LABEL: v_fdiv_recip_sqrt_f32: -; DAZ: ; %bb.0: -; DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; DAZ-NEXT: v_rcp_f32_e32 v2, v1 -; DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; DAZ-NEXT: v_fma_f32 v2, v4, v2, v2 -; DAZ-NEXT: v_mul_f32_e32 v4, v3, v2 -; DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3 -; DAZ-NEXT: v_fma_f32 v4, v5, v2, v4 -; DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3 -; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; DAZ-NEXT: s_setpc_b64 s[30:31] +; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: 
v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v1, v0 +; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v2, v1, v0 +; IR-IEEE-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v3, v1, v0 +; IR-IEEE-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-IEEE-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-IEEE-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v3, v2, v2 +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: 
v_mov_b32_e32 v1, 0xf800000 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32: +; CODEGEN-DAZ-SDAG: ; %bb.0: +; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CODEGEN-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CODEGEN-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; CODEGEN-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3 +; CODEGEN-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CODEGEN-DAZ-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; CODEGEN-DAZ-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32: +; CODEGEN-DAZ-GISEL: ; %bb.0: +; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; CODEGEN-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CODEGEN-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; CODEGEN-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32: +; IR-DAZ-SDAG: ; %bb.0: +; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; IR-DAZ-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32: +; IR-DAZ-GISEL: ; %bb.0: +; 
IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; IR-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call float @llvm.sqrt.f32(float %x) %fdiv = fdiv float 1.0, %sqrt ret float %fdiv @@ -365,7 +605,24 @@ ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp: ; CODEGEN-IEEE-GISEL: ; %bb.0: ; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 @@ -388,7 +645,24 @@ ; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp: ; IR-IEEE-GISEL: ; %bb.0: ; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-IEEE-GISEL-NEXT: 
v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 @@ -411,7 +685,23 @@ ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp: ; CODEGEN-DAZ-GISEL: ; %bb.0: ; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 @@ -436,7 +726,23 @@ ; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp: ; IR-DAZ-GISEL: ; %bb.0: ; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, 
v0, 1.0 @@ -523,7 +829,24 @@ ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: ; CODEGEN-IEEE-GISEL: ; %bb.0: ; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 @@ -546,7 +869,24 @@ ; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: ; IR-IEEE-GISEL: ; %bb.0: ; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 @@ -569,7 +909,23 @@ ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: ; CODEGEN-DAZ-GISEL: ; %bb.0: ; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 
+; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 @@ -594,7 +950,23 @@ ; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: ; IR-DAZ-GISEL: ; %bb.0: ; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 @@ -615,12 +987,193 @@ } define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) { -; CHECK-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_sqrt_f32_e32 v0, v0 -; CHECK-NEXT: v_rcp_f32_e32 v0, v0 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CODEGEN-IEEE-SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: +; 
CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v1, v0 +; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v2, v1, v0 +; IR-IEEE-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v3, v1, v0 +; IR-IEEE-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-IEEE-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-IEEE-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, 
v1, v0, vcc +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: +; CODEGEN-DAZ-SDAG: ; %bb.0: +; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CODEGEN-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: +; CODEGEN-DAZ-GISEL: ; %bb.0: +; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: +; IR-DAZ-SDAG: ; %bb.0: +; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: +; IR-DAZ-GISEL: ; %bb.0: +; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call float @llvm.sqrt.f32(float %x) %fdiv = fdiv afn float 1.0, %sqrt ret float %fdiv @@ -636,7 +1189,24 @@ ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: ; CODEGEN-IEEE-GISEL: ; %bb.0: ; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 ; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -649,7 +1219,24 @@ ; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: ; IR-IEEE-GISEL: ; %bb.0: ; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; 
IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 ; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -662,7 +1249,23 @@ ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: ; CODEGEN-DAZ-GISEL: ; %bb.0: ; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v0, v0 ; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -675,7 +1278,23 @@ ; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: ; IR-DAZ-GISEL: ; %bb.0: ; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v0, v0 ; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call float @llvm.sqrt.f32(float %x) @@ -878,7 +1497,14 @@ ; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25: ; CODEGEN-IEEE-SDAG: ; %bb.0: ; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0 ; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v1, v1 ; CODEGEN-IEEE-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 @@ -889,7 +1515,13 
@@ ; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25: ; CODEGEN-IEEE-GISEL: ; %bb.0: ; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-GISEL-NEXT: v_frexp_mant_f32_e32 v1, v0 ; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v1, v1 ; CODEGEN-IEEE-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 @@ -897,22 +1529,52 @@ ; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; IR-IEEE-LABEL: v_recip_sqrt_f32_ulp25: -; IR-IEEE: ; %bb.0: -; IR-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 -; IR-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; IR-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; IR-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; IR-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; IR-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; IR-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; IR-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; IR-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; IR-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; IR-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; IR-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; IR-IEEE-NEXT: s_setpc_b64 s[30:31] +; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; IR-IEEE-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: 
v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_ulp25: ; CODEGEN-DAZ: ; %bb.0: @@ -973,22 +1635,73 @@ ; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; IR-IEEE-LABEL: v_recip_sqrt_f32_ulp25_contract: -; IR-IEEE: ; %bb.0: -; IR-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 -; IR-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; IR-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; IR-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; IR-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; IR-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; IR-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; IR-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; IR-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; IR-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; IR-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; IR-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; IR-IEEE-NEXT: s_setpc_b64 s[30:31] +; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v1, v0 +; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v2, v1, v0 +; IR-IEEE-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v3, v1, v0 +; IR-IEEE-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-IEEE-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-IEEE-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v3, v2, v2 +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 +; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 +; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, 
s[4:5] +; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract: ; CODEGEN-DAZ: ; %bb.0: @@ -996,24 +1709,75 @@ ; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 ; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract: -; IR-DAZ: ; %bb.0: -; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; IR-DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; IR-DAZ-NEXT: v_rcp_f32_e32 v2, v1 -; IR-DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; IR-DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; IR-DAZ-NEXT: v_fma_f32 v2, v4, v2, v2 -; IR-DAZ-NEXT: v_mul_f32_e32 v4, v3, v2 -; IR-DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3 -; IR-DAZ-NEXT: v_fma_f32 v4, v5, v2, v4 -; IR-DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3 -; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; IR-DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; IR-DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; IR-DAZ-NEXT: s_setpc_b64 s[30:31] +; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract: +; IR-DAZ-SDAG: ; %bb.0: +; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 
4, 2), 0 +; IR-DAZ-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract: +; IR-DAZ-GISEL: ; %bb.0: +; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-DAZ-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 +; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; IR-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !0 %fdiv = fdiv contract float 1.0, %sqrt, !fpmath !0 ret float %fdiv @@ -1044,18 +1808,12 @@ ; CODEGEN-IEEE-GISEL-NEXT: v_rsq_f32_e32 v0, v0 ; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: -; IR-IEEE-SDAG: ; %bb.0: -; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 -; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v0, v0 -; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: -; IR-IEEE-GISEL: ; %bb.0: -; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-GISEL-NEXT: v_rsq_f32_e32 v0, v0 -; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; IR-IEEE-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; IR-IEEE: ; %bb.0: +; IR-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; IR-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: ; CODEGEN-DAZ: ; %bb.0: @@ -1063,18 +1821,12 @@ ; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 ; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: -; IR-DAZ-SDAG: ; %bb.0: -; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 -; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0 -; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; IR-DAZ-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: -; IR-DAZ-GISEL: ; %bb.0: -; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-GISEL-NEXT: 
v_rsq_f32_e32 v0, v0 -; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; IR-DAZ-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; IR-DAZ: ; %bb.0: +; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-NEXT: v_rcp_f32_e32 v0, v0 +; IR-DAZ-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn float @llvm.sqrt.f32(float %x), !fpmath !0 %fdiv = fdiv contract afn float 1.0, %sqrt, !fpmath !0 ret float %fdiv Index: llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -6,43 +6,389 @@ ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,GISEL,GCN-DAZ,GISEL-DAZ %s define float @v_sqrt_f32(float %x) { -; GCN-LABEL: v_sqrt_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: 
v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_fneg(float %x) { -; GCN-LABEL: v_sqrt_f32_fneg: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_fneg: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x8f800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_fneg: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GISEL-IEEE-NEXT: v_mul_f32_e64 v2, -v0, v2 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, v1, -v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; 
GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_fneg: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0x8f800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_fneg: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GISEL-DAZ-NEXT: v_mul_f32_e64 v2, -v0, v2 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, v1, -v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %x.neg = fneg float %x %result = call float @llvm.sqrt.f32(float %x.neg) ret float %result } define float @v_sqrt_f32_fabs(float %x) { -; GCN-LABEL: v_sqrt_f32_fabs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e64 v0, |v0| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_fabs: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: s_mov_b32 s5, 0x4f800000 +; SDAG-IEEE-NEXT: v_mul_f32_e64 v1, |v0|, s5 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, |v0|, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 
v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_fabs: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GISEL-IEEE-NEXT: v_mul_f32_e64 v2, |v0|, v2 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, v1, |v0| +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v0, |v0|, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_fabs: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: s_mov_b32 s5, 0x4f800000 +; SDAG-DAZ-NEXT: v_mul_f32_e64 v1, |v0|, s5 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v0, |v0|, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_fabs: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GISEL-DAZ-NEXT: v_mul_f32_e64 v2, |v0|, v2 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, v1, |v0| +; GISEL-DAZ-NEXT: v_cndmask_b32_e64 v0, |v0|, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %x.fabs = call float @llvm.fabs.f32(float %x) %result = call float @llvm.sqrt.f32(float %x.fabs) ret float %result } define float @v_sqrt_f32_fneg_fabs(float %x) { -; GCN-LABEL: v_sqrt_f32_fneg_fabs: -; GCN: ; %bb.0: -; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e64 v0, -|v0| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_fneg_fabs: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x8f800000 +; SDAG-IEEE-NEXT: s_mov_b32 s5, 0xcf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e64 v1, |v0|, s5 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, -|v0|, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_fneg_fabs: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GISEL-IEEE-NEXT: v_mul_f32_e64 v2, -|v0|, v2 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, v1, -|v0| +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v0, -|v0|, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_fneg_fabs: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0x8f800000 +; SDAG-DAZ-NEXT: s_mov_b32 s5, 0xcf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e64 v1, |v0|, s5 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v0, -|v0|, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_fneg_fabs: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: 
v_mov_b32_e32 v2, 0x4f800000 +; GISEL-DAZ-NEXT: v_mul_f32_e64 v2, -|v0|, v2 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, v1, -|v0| +; GISEL-DAZ-NEXT: v_cndmask_b32_e64 v0, -|v0|, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %x.fabs = call float @llvm.fabs.f32(float %x) %x.fabs.neg = fneg float %x.fabs %result = call float @llvm.sqrt.f32(float %x.fabs.neg) @@ -50,41 +396,385 @@ } define float @v_sqrt_f32_ninf(float %x) { -; GCN-LABEL: v_sqrt_f32_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_ninf: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ninf: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_ninf: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; 
SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_ninf: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call ninf float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_no_infs_attribute(float %x) #5 { -; GCN-LABEL: v_sqrt_f32_no_infs_attribute: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_no_infs_attribute: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_no_infs_attribute: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 
+; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_no_infs_attribute: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_no_infs_attribute: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call ninf float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_nnan(float %x) { -; GCN-LABEL: v_sqrt_f32_nnan: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_nnan: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: 
v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_nnan: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_nnan: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_nnan: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call nnan float @llvm.sqrt.f32(float %x) ret float %result } define amdgpu_ps i32 @s_sqrt_f32(float inreg %x) { -; GCN-LABEL: s_sqrt_f32: -; GCN: ; %bb.0: -; GCN-NEXT: v_sqrt_f32_e32 v0, s0 -; GCN-NEXT: v_readfirstlane_b32 s0, 
v0 -; GCN-NEXT: ; return to shader part epilog +; SDAG-IEEE-LABEL: s_sqrt_f32: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-IEEE-NEXT: ; return to shader part epilog +; +; GISEL-IEEE-LABEL: s_sqrt_f32: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, s0 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, s0, v2 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-IEEE-NEXT: ; return to shader part epilog +; +; SDAG-DAZ-LABEL: s_sqrt_f32: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-DAZ-NEXT: ; return to shader part epilog +; +; GISEL-DAZ-LABEL: s_sqrt_f32: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, s0 +; GISEL-DAZ-NEXT: 
v_mul_f32_e32 v2, s0, v2 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-DAZ-NEXT: ; return to shader part epilog %result = call float @llvm.sqrt.f32(float %x) %cast = bitcast float %result to i32 %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast) @@ -92,11 +782,103 @@ } define amdgpu_ps i32 @s_sqrt_f32_ninf(float inreg %x) { -; GCN-LABEL: s_sqrt_f32_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: v_sqrt_f32_e32 v0, s0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: ; return to shader part epilog +; SDAG-IEEE-LABEL: s_sqrt_f32_ninf: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-IEEE-NEXT: ; return to shader part epilog +; +; GISEL-IEEE-LABEL: s_sqrt_f32_ninf: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, s0 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, s0, v2 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-IEEE-NEXT: ; return to shader part epilog +; +; SDAG-DAZ-LABEL: s_sqrt_f32_ninf: +; SDAG-DAZ: 
; %bb.0: +; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-DAZ-NEXT: ; return to shader part epilog +; +; GISEL-DAZ-LABEL: s_sqrt_f32_ninf: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, s0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, s0, v2 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-DAZ-NEXT: ; return to shader part epilog %result = call ninf float @llvm.sqrt.f32(float %x) %cast = bitcast float %result to i32 %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast) @@ -128,31 +910,283 @@ } define float @v_sqrt_f32_nsz(float %x) { -; GCN-LABEL: v_sqrt_f32_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_nsz: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_nsz: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_nsz: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_nsz: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call nsz float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_nnan_ninf(float %x) { -; GCN-LABEL: v_sqrt_f32_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_nnan_ninf: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: 
v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_nnan_ninf: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_nnan_ninf: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_nnan_ninf: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; 
GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call nnan ninf float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_nnan_ninf_nsz(float %x) { -; GCN-LABEL: v_sqrt_f32_nnan_ninf_nsz: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_nnan_ninf_nsz: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_nnan_ninf_nsz: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_nnan_ninf_nsz: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; 
+; GISEL-DAZ-LABEL: v_sqrt_f32_nnan_ninf_nsz: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call nnan ninf nsz float @llvm.sqrt.f32(float %x) ret float %result } @@ -271,124 +1305,908 @@ } define <2 x float> @v_sqrt_v2f32(<2 x float> %x) { -; GCN-LABEL: v_sqrt_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_v2f32: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s6, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v2, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SDAG-IEEE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SDAG-IEEE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v4, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v2, v4, v1 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; SDAG-IEEE-NEXT: v_fma_f32 v4, -v5, v4, v1 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v1, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_v2f32: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 
v2, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v3, 0xf800000 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v2 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v2, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v2 +; GISEL-IEEE-NEXT: v_fma_f32 v7, -v6, v2, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v7 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-IEEE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v1 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v3, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v3, v1 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v2, v3, v1 +; GISEL-IEEE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v7, -v6, v3, v1 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v7 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v1, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_v2f32: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v2, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, v0, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v3, v4, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v5, -v3, v3, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v4, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v5, v2, v3 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0x4f800000, v1 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v3, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v4 +; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v1, v3 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v5, -v3, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v5, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v6, -v2, v2, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v3, v5, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v6, v3, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v1, v4 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_v2f32: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 
+; GISEL-DAZ-NEXT: v_rsq_f32_e32 v2, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v3, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v4, v0, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v5, -v2, v4, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v4, v4, v5, v4 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v5, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v5, -v4, v4, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v5, v2, v4 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-DAZ-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v3, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v3, v1 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v4, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v4 +; GISEL-DAZ-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v1, v3 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v5, -v3, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v5, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v3, v5, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v5, -v2, v2, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v5, v3, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v1, v4 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x) ret <2 x float> %result } define <3 x float> @v_sqrt_v3f32(<3 x float> %x) { -; GCN-LABEL: v_sqrt_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-NEXT: v_sqrt_f32_e32 v2, v2 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <3 x float> @llvm.sqrt.v3f32(<3 x float> %x) - ret <3 x float> %result -} - -; fpmath should be ignored -define float @v_sqrt_f32_ulp05(float %x) { -; GCN-LABEL: v_sqrt_f32_ulp05: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call float @llvm.sqrt.f32(float %x), !fpmath !0 - ret float %result +; SDAG-IEEE-LABEL: v_sqrt_v3f32: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s6, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v3, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v3 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v3 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v5, v3, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v1 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v5 +; SDAG-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v1 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, 
v6 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v5 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v6, v5, v1 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v5, 0x37800000, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v2 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v5 +; SDAG-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v5 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v6, v5, v2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v5, 0x37800000, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v2, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_v3f32: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v3, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0xf800000 +; GISEL-IEEE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v6, -v5, v3, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v8, -v7, v3, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v8 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v5, 0x37800000, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-IEEE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v1 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v4, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v6, v1 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[4:5] +; GISEL-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v6 +; GISEL-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v1 +; GISEL-IEEE-NEXT: v_add_i32_e64 v8, s[4:5], 1, v6 +; GISEL-IEEE-NEXT: v_fma_f32 v9, -v8, v6, v1 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v9 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v6, 0x37800000, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-IEEE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v2 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v4, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v4, v2 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GISEL-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v4 +; GISEL-IEEE-NEXT: v_fma_f32 v6, -v3, v4, v2 +; 
GISEL-IEEE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v4 +; GISEL-IEEE-NEXT: v_fma_f32 v8, -v7, v4, v2 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v8 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v2, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_v3f32: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s6, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v3, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, v0, v3 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v5, -v3, v4, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v4, v4, v5, v4 +; SDAG-DAZ-NEXT: v_fma_f32 v6, -v4, v4, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v3, v5, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v6, v3, v4 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v4, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v5 +; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[4:5] +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, v1, v4 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, 0.5, v4 +; SDAG-DAZ-NEXT: v_fma_f32 v6, -v4, v3, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v3, v6, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v4, v4, v6, v4 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v6, 0x4f800000, v2 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e64 s[4:5], s6, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; SDAG-DAZ-NEXT: v_fma_f32 v7, -v3, v3, v1 +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v6, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v7, v4, v3 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v1, v5 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, v2, v6 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, 0.5, v6 +; SDAG-DAZ-NEXT: v_fma_f32 v6, -v4, v3, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v3, v6, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v7, -v3, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, v4, v6, v4 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v7, v4, v3 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v2, v5 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_v3f32: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v3, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v4, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v5, v0, v3 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v6, -v3, v5, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v5, v5, v6, v5 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v3, v6, v3 +; 
GISEL-DAZ-NEXT: v_fma_f32 v6, -v5, v5, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v6, v3, v5 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v5, 0x37800000, v3 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-DAZ-NEXT: v_mul_f32_e32 v5, 0x4f800000, v1 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v4, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v5, v1 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v6, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v6 +; GISEL-DAZ-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[4:5] +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, v1, v5 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v5, 0.5, v5 +; GISEL-DAZ-NEXT: v_fma_f32 v7, -v5, v3, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v3, v7, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v5, v5, v7, v5 +; GISEL-DAZ-NEXT: v_fma_f32 v7, -v3, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v7, v5, v3 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v7, 0x4f800000, v2 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e64 s[4:5], v4, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5] +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v4, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v5, 0x37800000, v3 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v1, v6 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, v2, v4 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v4, 0.5, v4 +; GISEL-DAZ-NEXT: v_fma_f32 v5, -v4, v3, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v3, v5, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v4, v4, v5, v4 +; GISEL-DAZ-NEXT: v_fma_f32 v5, -v3, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v5, v4, v3 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; GISEL-DAZ-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v2, v6 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x float> @llvm.sqrt.v3f32(<3 x float> %x) + ret <3 x float> %result +} + +; fpmath should be ignored +define float @v_sqrt_f32_ulp05(float %x) { +; SDAG-IEEE-LABEL: v_sqrt_f32_ulp05: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ulp05: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, 
s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_ulp05: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_ulp05: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.sqrt.f32(float %x), !fpmath !0 + ret float %result } ; fpmath should be used with DAZ only define float @v_sqrt_f32_ulp1(float %x) { -; GCN-LABEL: v_sqrt_f32_ulp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_ulp1: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: 
v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ulp1: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_ulp1: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x), !fpmath !1 ret float %result } ; fpmath should always be used define float @v_sqrt_f32_ulp2(float %x) { -; GCN-LABEL: v_sqrt_f32_ulp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_ulp2: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x), !fpmath !2 ret float %result } ; fpmath should always be used define float @v_sqrt_f32_ulp25(float %x) { -; GCN-LABEL: v_sqrt_f32_ulp25: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_ulp25: +; SDAG-IEEE: ; %bb.0: +; 
SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ulp25: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_ulp25: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x), !fpmath !3 ret float %result } ; fpmath should always be used define float @v_sqrt_f32_ulp3(float %x) { -; GCN-LABEL: v_sqrt_f32_ulp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_ulp3: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ulp3: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_ulp3: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x), !fpmath !4 ret float %result } define float @v_sqrt_f32_ulp2_fabs(float %x) { -; GCN-LABEL: v_sqrt_f32_ulp2_fabs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e64 v0, |v0| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_fabs: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, s[4:5] +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: 
s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_fabs: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, s[4:5] +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_fabs: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, |v0| +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %x.fabs = call float @llvm.fabs.f32(float %x) %result = call float @llvm.sqrt.f32(float %x.fabs), !fpmath !2 ret float %result } define <2 x float> @v_sqrt_v2f32_ulp1(<2 x float> %x) { -; GCN-LABEL: v_sqrt_v2f32_ulp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp1: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s6, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v2, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SDAG-IEEE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SDAG-IEEE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v4, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v2, v4, v1 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; SDAG-IEEE-NEXT: v_fma_f32 v4, -v5, v4, v1 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v1, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp1: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v2, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v3, 0xf800000 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v2 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v2, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 
v6, s[4:5], 1, v2 +; GISEL-IEEE-NEXT: v_fma_f32 v7, -v6, v2, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v7 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-IEEE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v1 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v3, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v3, v1 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v2, v3, v1 +; GISEL-IEEE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v7, -v6, v3, v1 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v7 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v1, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp1: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v1, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !1 ret <2 x float> %result } ; fpmath should always be used define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) { -; GCN-LABEL: v_sqrt_v2f32_ulp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, s[4:5] +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; GISEL-IEEE-NEXT: 
v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, s[4:5] +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v1, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2 ret <2 x float> %result } define <2 x float> @v_sqrt_v2f32_ulp1_fabs(<2 x float> %x) { -; GCN-LABEL: v_sqrt_v2f32_ulp1_fabs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e64 v0, |v0| -; GCN-NEXT: v_sqrt_f32_e64 v1, |v1| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp1_fabs: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s6, 0xf800000 +; SDAG-IEEE-NEXT: s_mov_b32 s7, 0x4f800000 +; SDAG-IEEE-NEXT: v_mul_f32_e64 v2, |v0|, s7 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, |v0|, v2, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v2, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SDAG-IEEE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SDAG-IEEE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e64 v4, |v1|, s7 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s6 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, |v1|, v4, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v4, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v2, v4, v1 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; SDAG-IEEE-NEXT: v_fma_f32 v4, -v5, v4, v1 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v1, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp1_fabs: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; GISEL-IEEE-NEXT: s_mov_b32 s5, 0x4f800000 +; GISEL-IEEE-NEXT: v_mul_f32_e64 v2, |v0|, s5 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, s4, |v0| +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v0, |v0|, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v2, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v3, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x4f800000 +; GISEL-IEEE-NEXT: v_mul_f32_e64 v4, |v1|, v4 +; GISEL-IEEE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v2 +; GISEL-IEEE-NEXT: v_fma_f32 v6, -v5, v2, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v2 +; GISEL-IEEE-NEXT: v_fma_f32 v8, -v7, v2, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v5, 
s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v8 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v5, 0x37800000, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, v3, |v1| +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, |v1|, v4, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v3, v1 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v4, -v2, v3, v1 +; GISEL-IEEE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v7, -v6, v3, v1 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v7 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v1, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp1_fabs: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, |v0| +; GCN-DAZ-NEXT: v_sqrt_f32_e64 v1, |v1| +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x) %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x.fabs), !fpmath !1 ret <2 x float> %result @@ -396,12 +2214,50 @@ ; fpmath should always be used define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) { -; GCN-LABEL: v_sqrt_v2f32_ulp2_fabs: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e64 v0, |v0| -; GCN-NEXT: v_sqrt_f32_e64 v1, |v1| -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_fabs: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s6, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[6:7], |v1|, s6 +; SDAG-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[6:7] +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e64 v1, |v1|, v2 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, s[4:5] +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, s[6:7] +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_fabs: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: s_mov_b32 s6, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[6:7], |v1|, s6 +; GISEL-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[6:7] +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_ldexp_f32_e64 v1, |v1|, v2 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, s[4:5] +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 
-16, s[6:7] +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_fabs: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, |v0| +; GCN-DAZ-NEXT: v_sqrt_f32_e64 v1, |v1| +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x) %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x.fabs), !fpmath !2 ret <2 x float> %result @@ -456,8 +2312,15 @@ ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp: ; SDAG-IEEE: ; %bb.0: ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc @@ -470,11 +2333,17 @@ ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp: ; GISEL-IEEE: ; %bb.0: ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 -; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v0, v2, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-IEEE-NEXT: v_rcp_f32_e32 v1, v1 ; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 @@ -532,24 +2401,56 @@ } define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) { -; GCN-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv: -; GCN-IEEE: ; %bb.0: -; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 -; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 
v0, v0, v2 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; SDAG-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; SDAG-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GISEL-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GISEL-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv: ; SDAG-DAZ: ; %bb.0: @@ -584,31 +2485,63 @@ } define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) { -; GCN-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv: -; GCN-IEEE: ; %bb.0: -; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 -; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] -; -; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv: -; SDAG-DAZ: ; %bb.0: -; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; SDAG-DAZ-NEXT: s_mov_b32 s4, 0x6f800000 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x2f800000 +; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 
v2, 5, v2 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; SDAG-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; SDAG-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GISEL-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GISEL-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0x6f800000 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x2f800000 ; SDAG-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v0, v2 @@ -639,8 +2572,15 @@ ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp: ; SDAG-IEEE: ; %bb.0: ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 ; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc @@ -654,11 +2594,17 @@ ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp: ; GISEL-IEEE: ; %bb.0: ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 
-; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v0 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v3, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GISEL-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; GISEL-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 @@ -666,47 +2612,86 @@ ; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0 ; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp: -; SDAG-DAZ: ; %bb.0: -; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; SDAG-DAZ-NEXT: v_rcp_f32_e32 v0, v0 -; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v1, v0 -; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp: -; GISEL-DAZ: ; %bb.0: -; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-DAZ-NEXT: v_rsq_f32_e32 v0, v0 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v1, v0 -; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4 %result = fdiv arcp contract float %y, %sqrt, !fpmath !3 ret float %result } define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) { -; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp: -; GCN-IEEE: ; %bb.0: -; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 -; GCN-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 -; GCN-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 -; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; 
SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, s[4:5] +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; SDAG-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SDAG-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, s[4:5] +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GISEL-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GISEL-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp: ; GCN-DAZ: ; %bb.0: @@ -754,37 +2739,93 @@ } define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x float> %y) { -; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv: -; GCN-IEEE: ; %bb.0: -; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 -; GCN-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v5, v2 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v0 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s4 -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v5, v2, v5, vcc -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 -; GCN-IEEE-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-IEEE-NEXT: v_mul_f32_e32 v4, 
v5, v4 -; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v4, v0 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v3 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s4 -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 -; GCN-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 -; GCN-IEEE-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 -; GCN-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 -; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, -16, s[4:5] +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v5, v2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s4 +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v5, v2, v5, vcc +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; SDAG-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; SDAG-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SDAG-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, v5, v4 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v3 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s4 +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 +; SDAG-IEEE-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, -16, s[4:5] +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; 
GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v5, v2 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s4 +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v5, v2, v5, vcc +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GISEL-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GISEL-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GISEL-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v4, v5, v4 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v3 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s4 +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 +; GISEL-IEEE-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv: ; GCN-DAZ: ; %bb.0: @@ -812,49 +2853,88 @@ } define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2 x float> %y) { -; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp: -; GCN-IEEE: ; %bb.0: -; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 -; GCN-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v0 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-IEEE-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v4, v0 -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 -; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 -; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 -; GCN-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 -; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 -; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, -16, s[4:5] +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; SDAG-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; 
SDAG-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SDAG-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp: -; SDAG-DAZ: ; %bb.0: -; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v1, v1 -; SDAG-DAZ-NEXT: v_rcp_f32_e32 v0, v0 -; SDAG-DAZ-NEXT: v_rcp_f32_e32 v1, v1 -; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0 -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, v3, v1 -; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, vcc +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5] +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, -16, s[4:5] +; GISEL-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GISEL-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp: -; GISEL-DAZ: ; %bb.0: -; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-DAZ-NEXT: v_rsq_f32_e32 v0, v0 -; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v1 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, v3, v1 -; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v1, v1 +; GCN-DAZ-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0 +; GCN-DAZ-NEXT: v_mul_f32_e32 v1, v3, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4 %result = fdiv arcp contract <2 x float> %y, %sqrt, !fpmath !3 ret <2 x float> %result @@ -897,31 +2977,106 @@ } define float @v_sqrt_f32_known_never_posdenormal_ulp2(float 
nofpclass(psub) %x) { -; GCN-LABEL: v_sqrt_f32_known_never_posdenormal_ulp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_posdenormal_ulp2: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_posdenormal_ulp2: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_known_never_posdenormal_ulp2: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x), !fpmath !2 ret float %result } define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub) %x) { -; GCN-LABEL: v_sqrt_f32_nsz_known_never_posdenormal_ulp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_nsz_known_never_posdenormal_ulp2: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_nsz_known_never_posdenormal_ulp2: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_nsz_known_never_posdenormal_ulp2: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call nsz float @llvm.sqrt.f32(float %x), !fpmath !2 ret float %result } define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) { -; GCN-LABEL: v_sqrt_f32_known_never_negdenormal: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; 
SDAG-IEEE-LABEL: v_sqrt_f32_known_never_negdenormal: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_negdenormal: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_known_never_negdenormal: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x), !fpmath !2 ret float %result } @@ -937,75 +3092,581 @@ } define float @v_sqrt_f32_ninf_known_never_zero(float nofpclass(zero) %x) { -; GCN-LABEL: v_sqrt_f32_ninf_known_never_zero: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_ninf_known_never_zero: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_ninf_known_never_zero: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: 
v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_ninf_known_never_zero: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_ninf_known_never_zero: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call ninf float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_known_never_zero(float nofpclass(zero) %x) { -; GCN-LABEL: v_sqrt_f32_known_never_zero: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, 
v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_known_never_zero: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_known_never_zero: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_known_never_zero_never_inf(float nofpclass(zero inf) %x) { -; GCN-LABEL: v_sqrt_f32_known_never_zero_never_inf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_inf: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_inf: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_inf: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_inf: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: 
v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_known_never_zero_never_ninf(float nofpclass(zero ninf) %x) { -; GCN-LABEL: v_sqrt_f32_known_never_zero_never_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_ninf: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_ninf: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_ninf: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 
v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_ninf: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_known_never_zero_never_pinf(float nofpclass(zero pinf) %x) { -; GCN-LABEL: v_sqrt_f32_known_never_zero_never_pinf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_pinf: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_pinf: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; 
GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_pinf: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_pinf: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x) ret float %result } define float @v_sqrt_f32_frexp_src(float %x) { -; SDAG-LABEL: v_sqrt_f32_frexp_src: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0 -; SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SDAG-NEXT: v_sqrt_f32_e32 v0, v0 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_frexp_src: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: 
v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: v_sqrt_f32_frexp_src: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GISEL-NEXT: v_frexp_mant_f32_e32 v1, v0 -; GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GISEL-NEXT: v_sqrt_f32_e32 v0, v0 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GISEL-IEEE-LABEL: v_sqrt_f32_frexp_src: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_sqrt_f32_frexp_src: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0x7f800000 +; SDAG-DAZ-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_sqrt_f32_frexp_src: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GISEL-DAZ-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; 
GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %x) %frexp.mant = extractvalue { float, i32 } %frexp, 0 %result = call float @llvm.sqrt.f32(float %frexp.mant) @@ -1039,51 +3700,208 @@ } define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero ninf) %x) { -; GCN-LABEL: v_sqrt_f32_known_never_zero_never_ninf_ulp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_ninf_ulp2: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_ninf_ulp2: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_ninf_ulp2: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x), !fpmath !2 ret float %result } define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) { -; GCN-LABEL: v_sqrt_f32_known_never_ninf_ulp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_ninf_ulp2: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; 
SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_ninf_ulp2: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_known_never_ninf_ulp2: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.sqrt.f32(float %x), !fpmath !2 ret float %result } define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) { -; GCN-LABEL: v_sqrt_f32_nsz_known_never_ninf_ulp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_sqrt_f32_nsz_known_never_ninf_ulp2: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-IEEE-LABEL: v_sqrt_f32_nsz_known_never_ninf_ulp2: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-DAZ-LABEL: v_sqrt_f32_nsz_known_never_ninf_ulp2: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] %result = call nsz float @llvm.sqrt.f32(float %x), !fpmath !2 ret float %result } define float @v_elim_redun_check_ult_sqrt(float %in) { -; SDAG-LABEL: v_elim_redun_check_ult_sqrt: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f32_e32 v0, v0 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_elim_redun_check_ult_sqrt: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SDAG-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 
0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: v_elim_redun_check_ult_sqrt: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f32_e32 v1, v0 -; GISEL-NEXT: v_bfrev_b32_e32 v2, 1 -; GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GISEL-NEXT: v_cmp_nge_f32_e32 vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GISEL-IEEE-LABEL: v_elim_redun_check_ult_sqrt: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v0, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v2, v1 +; GISEL-IEEE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; GISEL-IEEE-NEXT: v_fma_f32 v4, -v3, v2, v1 +; GISEL-IEEE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v2 +; GISEL-IEEE-NEXT: v_fma_f32 v6, -v5, v2, v1 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v3, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v1, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v2, 1 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_elim_redun_check_ult_sqrt: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: s_mov_b32 s4, 0xf800000 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SDAG-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_elim_redun_check_ult_sqrt: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0xf800000 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, v1, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v0, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v2, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v3, v4, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v4, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v4, -v3, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v4, v2, v3 +; 
GISEL-DAZ-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v3, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v1, v3 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v2, 1 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp ult float %in, -0.000000e+00 %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt @@ -1091,21 +3909,58 @@ } define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) { -; SDAG-LABEL: v_elim_redun_check_ult_sqrt_ulp3: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f32_e32 v0, v0 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-IEEE-LABEL: v_elim_redun_check_ult_sqrt_ulp3: +; SDAG-IEEE: ; %bb.0: +; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 +; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v0, v1 +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; SDAG-IEEE-NEXT: s_brev_b32 s4, 1 +; SDAG-IEEE-NEXT: v_cmp_le_f32_e32 vcc, s4, v0 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: v_elim_redun_check_ult_sqrt_ulp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f32_e32 v1, v0 -; GISEL-NEXT: v_bfrev_b32_e32 v2, 1 -; GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GISEL-NEXT: v_cmp_nge_f32_e32 vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GISEL-IEEE-LABEL: v_elim_redun_check_ult_sqrt_ulp3: +; GISEL-IEEE: ; %bb.0: +; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v0, v1 +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v2, 1 +; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-DAZ-LABEL: v_elim_redun_check_ult_sqrt_ulp3: +; SDAG-DAZ: ; %bb.0: +; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: s_brev_b32 s4, 1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; SDAG-DAZ-NEXT: v_cmp_le_f32_e32 vcc, s4, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-DAZ-LABEL: v_elim_redun_check_ult_sqrt_ulp3: +; GISEL-DAZ: ; %bb.0: +; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-DAZ-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v2, 1 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31] %sqrt = call float 
@llvm.sqrt.f32(float %in), !fpmath !4 %cmp = fcmp ult float %in, -0.000000e+00 %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt @@ -1113,31 +3968,131 @@ } define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) { -; SDAG-LABEL: elim_redun_check_neg0: -; SDAG: ; %bb.0: ; %entry -; SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f32_e32 v0, s2 -; SDAG-NEXT: s_mov_b32 s2, -1 -; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: elim_redun_check_neg0: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_load_dword s3, s[0:1], 0xb -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GISEL-NEXT: v_bfrev_b32_e32 v0, 1 -; GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f32_e32 v1, s3 -; GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; GISEL-NEXT: s_mov_b32 s3, 0xf000 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GISEL-NEXT: s_endpgm +; SDAG-IEEE-LABEL: elim_redun_check_neg0: +; SDAG-IEEE: ; %bb.0: ; %entry +; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 +; SDAG-IEEE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SDAG-IEEE-NEXT: s_endpgm +; +; GISEL-IEEE-LABEL: elim_redun_check_neg0: +; GISEL-IEEE: ; %bb.0: ; %entry +; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 +; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 
v1, v4, s[0:1] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GISEL-IEEE-NEXT: s_endpgm +; +; SDAG-DAZ-LABEL: elim_redun_check_neg0: +; SDAG-DAZ: ; %bb.0: ; %entry +; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SDAG-DAZ-NEXT: s_endpgm +; +; GISEL-DAZ-LABEL: elim_redun_check_neg0: +; GISEL-DAZ: ; %bb.0: ; %entry +; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GISEL-DAZ-NEXT: s_endpgm entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp olt float %in, -0.000000e+00 @@ -1147,30 +4102,129 @@ } define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) { -; SDAG-LABEL: elim_redun_check_pos0: -; SDAG: ; %bb.0: ; %entry -; SDAG-NEXT: s_load_dword s2, s[0:1], 
0xb -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f32_e32 v0, s2 -; SDAG-NEXT: s_mov_b32 s2, -1 -; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: elim_redun_check_pos0: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_load_dword s3, s[0:1], 0xb -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f32_e32 v0, s3 -; GISEL-NEXT: v_cmp_lt_f32_e64 vcc, s3, 0 -; GISEL-NEXT: s_mov_b32 s3, 0xf000 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GISEL-NEXT: s_endpgm +; SDAG-IEEE-LABEL: elim_redun_check_pos0: +; SDAG-IEEE: ; %bb.0: ; %entry +; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 +; SDAG-IEEE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SDAG-IEEE-NEXT: s_endpgm +; +; GISEL-IEEE-LABEL: elim_redun_check_pos0: +; GISEL-IEEE: ; %bb.0: ; %entry +; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 +; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 +; GISEL-IEEE-NEXT: 
v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GISEL-IEEE-NEXT: s_endpgm +; +; SDAG-DAZ-LABEL: elim_redun_check_pos0: +; SDAG-DAZ: ; %bb.0: ; %entry +; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SDAG-DAZ-NEXT: s_endpgm +; +; GISEL-DAZ-LABEL: elim_redun_check_pos0: +; GISEL-DAZ: ; %bb.0: ; %entry +; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GISEL-DAZ-NEXT: s_endpgm entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp olt float %in, 0.000000e+00 @@ -1180,31 +4234,131 @@ } define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) { -; SDAG-LABEL: elim_redun_check_ult: -; SDAG: ; %bb.0: ; %entry -; SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f32_e32 v0, s2 -; SDAG-NEXT: s_mov_b32 s2, -1 -; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: elim_redun_check_ult: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_load_dword s3, s[0:1], 0xb -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GISEL-NEXT: v_bfrev_b32_e32 v0, 1 -; 
GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f32_e32 v1, s3 -; GISEL-NEXT: v_cmp_nge_f32_e32 vcc, s3, v0 -; GISEL-NEXT: s_mov_b32 s3, 0xf000 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GISEL-NEXT: s_endpgm +; SDAG-IEEE-LABEL: elim_redun_check_ult: +; SDAG-IEEE: ; %bb.0: ; %entry +; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 +; SDAG-IEEE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; SDAG-IEEE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SDAG-IEEE-NEXT: s_endpgm +; +; GISEL-IEEE-LABEL: elim_redun_check_ult: +; GISEL-IEEE: ; %bb.0: ; %entry +; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 +; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 +; GISEL-IEEE-NEXT: v_fma_f32 v5, -v4, v1, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GISEL-IEEE-NEXT: s_endpgm +; +; SDAG-DAZ-LABEL: elim_redun_check_ult: +; SDAG-DAZ: ; %bb.0: ; %entry +; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; 
SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; SDAG-DAZ-NEXT: v_fma_f32 v1, v4, v1, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SDAG-DAZ-NEXT: s_endpgm +; +; GISEL-DAZ-LABEL: elim_redun_check_ult: +; GISEL-DAZ: ; %bb.0: ; %entry +; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v2, v2, v3, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v1, v3, v1 +; GISEL-DAZ-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v1, v3, v1, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GISEL-DAZ-NEXT: s_endpgm entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp ult float %in, -0.000000e+00 @@ -1214,37 +4368,203 @@ } define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) { -; SDAG-LABEL: elim_redun_check_v2: -; SDAG: ; %bb.0: ; %entry -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SDAG-NEXT: s_mov_b32 s6, -1 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f32_e32 v1, s3 -; SDAG-NEXT: v_sqrt_f32_e32 v0, s2 -; SDAG-NEXT: s_mov_b32 s4, s0 -; SDAG-NEXT: s_mov_b32 s5, s1 -; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: elim_redun_check_v2: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GISEL-NEXT: s_mov_b32 s4, 0x80000000 -; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f32_e32 v2, s2 -; GISEL-NEXT: v_sqrt_f32_e32 v4, s3 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s3 -; GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GISEL-NEXT: 
v_cmp_gt_f32_e32 vcc, s4, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_mov_b32 s3, 0xf000 -; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GISEL-NEXT: s_endpgm +; SDAG-IEEE-LABEL: elim_redun_check_v2: +; SDAG-IEEE: ; %bb.0: ; %entry +; SDAG-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-IEEE-NEXT: s_mov_b32 s11, 0xf000 +; SDAG-IEEE-NEXT: s_mov_b32 s10, -1 +; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s7, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s7 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 +; SDAG-IEEE-NEXT: s_mov_b32 s8, s4 +; SDAG-IEEE-NEXT: s_mov_b32 s9, s5 +; SDAG-IEEE-NEXT: v_add_i32_e32 v4, vcc, -1, v3 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v2 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v5, v3, v2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s6 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v5, v1, s[0:1] +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v0 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v2, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v5, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v3, v5, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SDAG-IEEE-NEXT: s_endpgm +; +; GISEL-IEEE-LABEL: elim_redun_check_v2: +; GISEL-IEEE: ; %bb.0: ; %entry +; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GISEL-IEEE-NEXT: s_mov_b32 s0, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0xf800000 +; GISEL-IEEE-NEXT: s_mov_b32 s2, 0x80000000 +; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, s6 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, s6, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, s7, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 +; GISEL-IEEE-NEXT: v_add_i32_e64 v5, s[0:1], -1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v6, -v5, v3, v2 +; GISEL-IEEE-NEXT: v_add_i32_e64 v7, s[0:1], 1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v8, -v7, v3, v2 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v6 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v8 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] +; GISEL-IEEE-NEXT: 
v_mul_f32_e32 v5, 0x37800000, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v6, s7 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v4, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e64 s[0:1], v2, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GISEL-IEEE-NEXT: v_add_i32_e64 v3, s[0:1], -1, v4 +; GISEL-IEEE-NEXT: v_fma_f32 v7, -v3, v4, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v8, s[0:1], 1, v4 +; GISEL-IEEE-NEXT: v_fma_f32 v9, -v8, v4, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v7 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v9 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[0:1] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v6 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GISEL-IEEE-NEXT: s_endpgm +; +; SDAG-DAZ-LABEL: elim_redun_check_v2: +; SDAG-DAZ: ; %bb.0: ; %entry +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-DAZ-NEXT: s_mov_b32 s6, -1 +; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, s3, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v3, s3 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v3, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: s_mov_b32 s4, s0 +; SDAG-DAZ-NEXT: s_mov_b32 s5, s1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, v2, v3 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v5, -v3, v4, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v4, v4, v5, v4 +; SDAG-DAZ-NEXT: v_fma_f32 v6, -v4, v4, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v3, v5, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v6, v3, v4 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v4, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v4, v0 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e64 s[0:1], v2, v5 +; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v1, v3, v2, s[0:1] +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v4 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v4 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v3, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v4, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v6, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v3, v4, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v6, v3, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-DAZ-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SDAG-DAZ-NEXT: s_endpgm +; +; GISEL-DAZ-LABEL: elim_redun_check_v2: +; GISEL-DAZ: ; %bb.0: ; %entry +; GISEL-DAZ-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s0, 0xf800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v4, 0xf800000 +; GISEL-DAZ-NEXT: s_mov_b32 s2, 0x80000000 +; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, s6 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, s6, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s0, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v3, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, s7, v0 +; GISEL-DAZ-NEXT: s_mov_b32 s6, -1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v5, v2, v3 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v6, -v3, v5, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v5, v5, v6, v5 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v3, v6, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v6, -v5, v5, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v6, v3, v5 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v5, 0x37800000, v3 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v5, s7 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v6, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e64 s[0:1], v2, v6 +; GISEL-DAZ-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, v0, v4 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v4, 0.5, v4 +; GISEL-DAZ-NEXT: v_fma_f32 v7, -v4, v3, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v3, v7, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v4, v4, v7, v4 +; GISEL-DAZ-NEXT: v_fma_f32 v7, -v3, v3, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v7, v4, v3 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v6 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-DAZ-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GISEL-DAZ-NEXT: s_endpgm entry: %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) %cmp = fcmp olt <2 x float> %in, @@ -1254,37 +4574,203 @@ } define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) { -; SDAG-LABEL: elim_redun_check_v2_ult: -; SDAG: ; %bb.0: ; %entry -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SDAG-NEXT: s_mov_b32 s6, -1 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_sqrt_f32_e32 v1, s3 -; SDAG-NEXT: v_sqrt_f32_e32 v0, s2 -; SDAG-NEXT: s_mov_b32 s4, s0 -; SDAG-NEXT: s_mov_b32 s5, s1 -; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: elim_redun_check_v2_ult: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GISEL-NEXT: s_mov_b32 s4, 0x80000000 -; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_sqrt_f32_e32 v2, s2 -; GISEL-NEXT: v_sqrt_f32_e32 v4, s3 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s3 -; GISEL-NEXT: v_cmp_nle_f32_e32 vcc, s4, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GISEL-NEXT: v_cmp_nle_f32_e32 vcc, s4, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_mov_b32 s3, 0xf000 -; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 
-; GISEL-NEXT: s_endpgm +; SDAG-IEEE-LABEL: elim_redun_check_v2_ult: +; SDAG-IEEE: ; %bb.0: ; %entry +; SDAG-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-IEEE-NEXT: s_mov_b32 s11, 0xf000 +; SDAG-IEEE-NEXT: s_mov_b32 s10, -1 +; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s7, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s7 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 +; SDAG-IEEE-NEXT: s_mov_b32 s8, s4 +; SDAG-IEEE-NEXT: s_mov_b32 s9, s5 +; SDAG-IEEE-NEXT: v_add_i32_e32 v4, vcc, -1, v3 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v2 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v5, v3, v2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s6 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v5, v1, s[0:1] +; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v0 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v2, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; SDAG-IEEE-NEXT: v_fma_f32 v3, -v2, v5, v0 +; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SDAG-IEEE-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; SDAG-IEEE-NEXT: v_fma_f32 v5, -v3, v5, v0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v5 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SDAG-IEEE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SDAG-IEEE-NEXT: s_endpgm +; +; GISEL-IEEE-LABEL: elim_redun_check_v2_ult: +; GISEL-IEEE: ; %bb.0: ; %entry +; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GISEL-IEEE-NEXT: s_mov_b32 s0, 0xf800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0xf800000 +; GISEL-IEEE-NEXT: s_mov_b32 s2, 0x80000000 +; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, s6 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v2, s6, v0 +; GISEL-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, s7, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 +; GISEL-IEEE-NEXT: v_add_i32_e64 v5, s[0:1], -1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v6, -v5, v3, v2 +; GISEL-IEEE-NEXT: v_add_i32_e64 v7, s[0:1], 1, v3 +; GISEL-IEEE-NEXT: v_fma_f32 v8, -v7, v3, v2 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v6 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v8 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v5, 0x37800000, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v6, s7 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GISEL-IEEE-NEXT: 
v_cndmask_b32_e32 v0, v6, v0, vcc +; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v4, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v5, 0x260 +; GISEL-IEEE-NEXT: v_cmp_class_f32_e64 s[0:1], v2, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GISEL-IEEE-NEXT: v_add_i32_e64 v3, s[0:1], -1, v4 +; GISEL-IEEE-NEXT: v_fma_f32 v7, -v3, v4, v0 +; GISEL-IEEE-NEXT: v_add_i32_e64 v8, s[0:1], 1, v4 +; GISEL-IEEE-NEXT: v_fma_f32 v9, -v8, v4, v0 +; GISEL-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v7 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v9 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[0:1] +; GISEL-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GISEL-IEEE-NEXT: v_cmp_nle_f32_e32 vcc, s2, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GISEL-IEEE-NEXT: v_cmp_nle_f32_e32 vcc, s2, v6 +; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GISEL-IEEE-NEXT: s_endpgm +; +; SDAG-DAZ-LABEL: elim_redun_check_v2_ult: +; SDAG-DAZ: ; %bb.0: ; %entry +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-DAZ-NEXT: s_mov_b32 s6, -1 +; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, s3, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v3, s3 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v3, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: s_mov_b32 s4, s0 +; SDAG-DAZ-NEXT: s_mov_b32 s5, s1 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, v2, v3 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v5, -v3, v4, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v4, v4, v5, v4 +; SDAG-DAZ-NEXT: v_fma_f32 v6, -v4, v4, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v3, v5, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v6, v3, v4 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SDAG-DAZ-NEXT: v_mov_b32_e32 v4, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; SDAG-DAZ-NEXT: v_rsq_f32_e32 v4, v0 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v5, 0x260 +; SDAG-DAZ-NEXT: v_cmp_class_f32_e64 s[0:1], v2, v5 +; SDAG-DAZ-NEXT: v_cndmask_b32_e64 v1, v3, v2, s[0:1] +; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v4 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v4 +; SDAG-DAZ-NEXT: v_fma_f32 v4, -v3, v2, 0.5 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v2, v4, v2 +; SDAG-DAZ-NEXT: v_fma_f32 v6, -v2, v2, v0 +; SDAG-DAZ-NEXT: v_fma_f32 v3, v3, v4, v3 +; SDAG-DAZ-NEXT: v_fma_f32 v2, v6, v3, v2 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG-DAZ-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SDAG-DAZ-NEXT: s_endpgm +; +; GISEL-DAZ-LABEL: elim_redun_check_v2_ult: +; GISEL-DAZ: ; %bb.0: ; %entry +; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s0, 0xf800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v4, 0xf800000 +; GISEL-DAZ-NEXT: s_mov_b32 
s2, 0x80000000 +; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, s6 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, s6, v0 +; GISEL-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s0, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v3, v2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, s7, v0 +; GISEL-DAZ-NEXT: s_mov_b32 s6, -1 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v5, v2, v3 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v6, -v3, v5, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v5, v5, v6, v5 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v3, v6, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v6, -v5, v5, v2 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v6, v3, v5 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v5, 0x37800000, v3 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v5, s7 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GISEL-DAZ-NEXT: v_rsq_f32_e32 v4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v6, 0x260 +; GISEL-DAZ-NEXT: v_cmp_class_f32_e64 s[0:1], v2, v6 +; GISEL-DAZ-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GISEL-DAZ-NEXT: v_mul_f32_e32 v3, v0, v4 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v4, 0.5, v4 +; GISEL-DAZ-NEXT: v_fma_f32 v7, -v4, v3, 0.5 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v3, v7, v3 +; GISEL-DAZ-NEXT: v_fma_f32 v4, v4, v7, v4 +; GISEL-DAZ-NEXT: v_fma_f32 v7, -v3, v3, v0 +; GISEL-DAZ-NEXT: v_fma_f32 v3, v7, v4, v3 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v6 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GISEL-DAZ-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GISEL-DAZ-NEXT: v_cmp_nle_f32_e32 vcc, s2, v1 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GISEL-DAZ-NEXT: v_cmp_nle_f32_e32 vcc, s2, v5 +; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-DAZ-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GISEL-DAZ-NEXT: s_endpgm entry: %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) %cmp = fcmp ult <2 x float> %in, Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -65,8 +65,15 @@ } ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32: -; SI: v_sqrt_f32_e32 -; SI: v_rcp_f32_e32 +; SI: v_mul_f32 +; SI: v_rsq_f32 +; SI: v_mul_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_rcp_f32 define amdgpu_kernel void @safe_rsq_rcp_pat_f32(ptr addrspace(1) %out, float %src) #1 { %sqrt = call contract float @llvm.sqrt.f32(float %src) %rcp = call contract float @llvm.amdgcn.rcp.f32(float %sqrt) Index: llvm/test/CodeGen/AMDGPU/rsq.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -65,30 +65,63 @@ ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; 
GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-DAZ-SAFE-NEXT: s_endpgm ; ; SI-IEEE-SAFE-LABEL: rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 -; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc @@ -96,30 +129,47 @@ ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; CI-IEEE-SAFE-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 -; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: rsq_f32: ; GCN-UNSAFE: ; %bb.0: @@ -171,11 +221,29 @@ ; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, s2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s2 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v3, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 
GCN-DAZ-SAFE-NEXT: s_endpgm @@ -183,36 +251,74 @@ ; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, s2 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, -1 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x7f800000 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s0 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb -; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, s2 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, -1 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; 
CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-UNSAFE: ; %bb.0: @@ -308,24 +414,40 @@ ; ; GCN-DAZ-SAFE-LABEL: rsqrt_fmul: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, 0 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[2:3] -; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3] ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v2, v2 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v2 +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v2, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v5, v7, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v7, v8, v7 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v8, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v7, v7, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v8, v5, v7 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4 +; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v4 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v5, v3 ; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 ; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -338,31 +460,48 @@ ; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GCN-DAZ-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; GCN-DAZ-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4 -; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-DAZ-SAFE-NEXT: s_endpgm ; ; GCN-IEEE-SAFE-LABEL: rsqrt_fmul: ; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s6, 0 +; GCN-IEEE-SAFE-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[8:9], s[2:3] -; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3] ; GCN-IEEE-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; GCN-IEEE-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; GCN-IEEE-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v2 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v5, v2 +; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[0:1], -1, v5 +; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v8, s[0:1], 1, v5 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v9, -v7, v5, v2 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v10, -v8, v5, v2 +; GCN-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v9 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1] +; GCN-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v10 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[0:1] +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GCN-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, v4 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4 ; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v5, v3 ; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 +; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5] ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5 @@ -371,7 +510,7 @@ ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6 ; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4 -; GCN-IEEE-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-IEEE-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-IEEE-SAFE-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -440,30 +579,63 @@ ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, 
-v1, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-DAZ-SAFE-NEXT: s_endpgm ; ; SI-IEEE-SAFE-LABEL: neg_rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 -; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1] @@ -471,30 +643,47 @@ ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: neg_rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; 
CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 -; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_f32: ; GCN-UNSAFE: ; %bb.0: @@ -571,30 +760,63 @@ ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-DAZ-SAFE-NEXT: s_endpgm ; ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 -; SI-IEEE-SAFE-NEXT: 
s_mov_b32 s11, s7 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 -; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1] +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1] @@ -602,30 +824,47 @@ ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 -; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; 
CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1] +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-UNSAFE: ; %bb.0: @@ -671,14 +910,47 @@ ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 ; 
SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 @@ -692,7 +964,24 @@ ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 @@ -727,8 +1016,39 @@ ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s5 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v3, -v0, s5 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -736,9 +1056,42 @@ ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, 
-v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] @@ -758,15 +1111,48 @@ ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; 
CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v3, -v1 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v3 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 @@ -795,7 +1181,23 @@ ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -803,7 +1205,24 @@ ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; SI-IEEE-SAFE-NEXT: 
v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 @@ -818,7 +1237,24 @@ ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 @@ -855,8 +1291,39 @@ ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s5 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v5, -v0, s5 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 @@ -866,9 +1333,42 @@ ; 
SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5] @@ -890,8 +1390,41 @@ ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc +; CI-IEEE-SAFE-NEXT: 
v_sqrt_f32_e32 v6, v0 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 @@ -930,14 +1463,47 @@ ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 @@ -951,7 +1517,24 @@ ; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 
v0, v0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 @@ -985,8 +1568,38 @@ ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -994,9 +1607,41 @@ ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 +; SI-IEEE-SAFE-NEXT: 
v_add_i32_e64 v3, s[4:5], -1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] @@ -1016,15 +1661,47 @@ ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, 
v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v3, -v1 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v3 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 @@ -1052,7 +1729,23 @@ ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -1060,7 +1753,24 @@ ; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 @@ -1075,7 +1785,24 @@ ; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, 
v0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 @@ -1111,8 +1838,38 @@ ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 @@ -1122,9 +1879,41 @@ ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 +; 
SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5] @@ -1146,8 +1935,40 @@ ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 +; 
CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 @@ -1225,7 +2046,24 @@ ; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 @@ -1239,7 +2077,24 @@ ; CI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 @@ -1276,7 +2131,24 @@ ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, 
s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 @@ -1290,7 +2162,24 @@ ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 @@ -1325,7 +2214,24 @@ ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 @@ -1339,7 +2245,24 @@ ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0