Index: llvm/docs/ReleaseNotes.rst =================================================================== --- llvm/docs/ReleaseNotes.rst +++ llvm/docs/ReleaseNotes.rst @@ -163,6 +163,10 @@ * llvm.exp2.f32 and llvm.exp.f32 are now lowered accurately. Use llvm.amdgcn.exp2.f32 to access the old behavior for llvm.exp2.f32. +* Implemented new 1ulp IEEE lowering strategy for float reciprocal + which saves 2 instructions. This is used by default for OpenCL on + gfx9+. With ``contract`` flags, this will fold into a 1 ulp rsqrt. + Changes to the ARM Backend -------------------------- Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -133,6 +133,13 @@ /// Return true if \p T is a legal scalar floating point type. bool isLegalFloatingTy(const Type *T) const; + /// Wrapper to pass all the arguments to computeKnownFPClass + KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested, + const Instruction *CtxI) const { + return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI, + DT); + } + /// Promotes uniform binary operation \p I to equivalent 32 bit binary /// operation. /// @@ -233,6 +240,14 @@ Value *matchFractPat(IntrinsicInst &I); Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg); + Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den, + FastMathFlags DivFMF, FastMathFlags SqrtFMF, + const Instruction *CtxI, bool AllowApproxRsq) const; + + Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den, + FastMathFlags FMF, const Instruction *CtxI, + bool AllowInaccurateRcp, bool RcpIsAccurate) const; + public: bool visitFDiv(BinaryOperator &I); @@ -727,30 +742,138 @@ return true; } +/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals. 
+static Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative, + bool HasFractBug) { + // Same as for 1.0, but expand the sign out of the constant. + // -1.0 / x -> rcp (fneg x) + if (IsNegative) + Src = Builder.CreateFNeg(Src); + + // The rcp instruction doesn't support denormals, so scale the input + // out of the denormal range and convert at the end. + // + // Expand as 2^-n * (1.0 / (x * 2^n)) + + // TODO: Skip scaling if input is known never denormal and the input + // range won't underflow to denormal. The hard part is knowing the + // result. We need a range check, the result could be denormal for + // 0x1p+126 < den <= 0x1p+127. + + Type *Ty = Src->getType(); + Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp, + {Ty, Builder.getInt32Ty()}, Src); + Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0}); + + // Bypass the bug workaround for the exponent result since it doesn't matter. + // TODO: Does the bug workaround even really need to consider the exponent + // result? It's unspecified by the spec. + + Value *FrexpExp = + HasFractBug ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp, + {Builder.getInt32Ty(), Ty}, Src) + : Builder.CreateExtractValue(Frexp, {1}); + + Value *ScaleFactor = Builder.CreateNeg(FrexpExp); + Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant); + return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, + {Rcp, ScaleFactor}); +} + +/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals. +static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, + bool IsNegative) { + // bool need_scale = x < 0x1p-126f; + // float input_scale = need_scale ? 0x1.0p+24f : 1.0f; + // float output_scale = need_scale ? 
0x1.0p+12f : 1.0f; + // rsq(x * input_scale) * output_scale; + + Type *Ty = Src->getType(); + APFloat SmallestNormal = + APFloat::getSmallestNormalized(Ty->getFltSemantics()); + Value *NeedScale = + Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal)); + Constant *One = ConstantFP::get(Ty, 1.0); + Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24); + Constant *OutputScale = + ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12); + + Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One); + + Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor); + Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput); + Value *OutputScaleFactor = Builder.CreateSelect( + NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One); + + return Builder.CreateFMul(Rsq, OutputScaleFactor); +} + +Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( + IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, + FastMathFlags SqrtFMF, const Instruction *CtxI, bool AllowApproxRsq) const { + // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. + if (!DivFMF.allowContract() || !SqrtFMF.allowContract()) + return nullptr; + + const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num); + if (!CLHS) + return nullptr; + + Type *Ty = Den->getType(); + assert(Ty->isFloatTy()); + + bool IsNegative = false; + if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) { + // Add in the sqrt flags. + IRBuilder<>::FastMathFlagGuard Guard(Builder); + DivFMF |= SqrtFMF; + Builder.setFastMathFlags(DivFMF); + + if (HasFP32DenormalFlush || AllowApproxRsq || + computeKnownFPClass(Den, fcSubnormal, CtxI).isKnownNeverSubnormal()) { + Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); + // -1.0 / sqrt(x) -> fneg(rsq(x)) + return IsNegative ? 
Builder.CreateFNeg(Result) : Result; + } + + return emitRsqIEEE1ULP(Builder, Den, IsNegative); + } + + return nullptr; +} + // Optimize fdiv with rcp: // // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is // allowed with unsafe-fp-math or afn. // -// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. -static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, - bool RcpIsAccurate, IRBuilder<> &Builder, - Module *Mod) { - - if (!AllowInaccurateRcp && !RcpIsAccurate) - return nullptr; +// a/b -> a*rcp(b) when arcp is allowed, and we only need to provide 1.0 ULP +Value *AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, + Value *Num, Value *Den, + FastMathFlags FMF, + const Instruction *CtxI, + bool AllowInaccurateRcp, + bool RcpIsAccurate) const { + assert(AllowInaccurateRcp || RcpIsAccurate); Type *Ty = Den->getType(); + assert(Ty->isFloatTy()); + if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) { - if (AllowInaccurateRcp || RcpIsAccurate) { - if (CLHS->isExactlyValue(1.0)) { - Function *Decl = Intrinsic::getDeclaration( - Mod, Intrinsic::amdgcn_rcp, Ty); + bool IsNegative = false; + if (CLHS->isExactlyValue(1.0) || + (IsNegative = CLHS->isExactlyValue(-1.0))) { + Value *Src = Den; + + if (HasFP32DenormalFlush || AllowInaccurateRcp) { + // -1.0 / x -> 1.0 / fneg(x) + if (IsNegative) + Src = Builder.CreateFNeg(Src); // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. - // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to - // use it as long as we aren't trying to use denormals. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK + // to use it as long as we aren't trying to use denormals. // // v_rcp_f16 and v_rsq_f16 DO support denormals. @@ -758,30 +881,29 @@ // insert rsq intrinsic here. 
// 1.0 / x -> rcp(x) - return Builder.CreateCall(Decl, { Den }); + return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src); } - // Same as for 1.0, but expand the sign out of the constant. - if (CLHS->isExactlyValue(-1.0)) { - Function *Decl = Intrinsic::getDeclaration( - Mod, Intrinsic::amdgcn_rcp, Ty); - - // -1.0 / x -> rcp (fneg x) - Value *FNeg = Builder.CreateFNeg(Den); - return Builder.CreateCall(Decl, { FNeg }); - } + // TODO: If the input isn't denormal, and we know the input exponent isn't + // big enough to introduce a denormal we can avoid the scaling. + return emitRcpIEEE1ULP(Builder, Src, IsNegative, ST->hasFractBug()); } } - if (AllowInaccurateRcp) { - Function *Decl = Intrinsic::getDeclaration( - Mod, Intrinsic::amdgcn_rcp, Ty); - - // Turn into multiply by the reciprocal. + if (FMF.allowReciprocal()) { // x / y -> x * (1.0 / y) - Value *Recip = Builder.CreateCall(Decl, { Den }); + + // TODO: Could avoid denormal scaling and use raw rcp if we knew the output + // will never underflow. + if (AllowInaccurateRcp || HasFP32DenormalFlush) { + Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den); + return Builder.CreateFMul(Num, Recip); + } + + Value *Recip = emitRcpIEEE1ULP(Builder, Den, false, ST->hasFractBug()); return Builder.CreateFMul(Num, Recip); } + return nullptr; } @@ -845,21 +967,36 @@ const float ReqdAccuracy = FPOp->getFPAccuracy(); // Inaccurate rcp is allowed with unsafe-fp-math or afn. - FastMathFlags FMF = FPOp->getFastMathFlags(); - const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc(); + FastMathFlags DivFMF = FPOp->getFastMathFlags(); + const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); + bool AllowApproxRsq = false; + + FastMathFlags SqrtFMF; // rcp_f16 is accurate to 0.51 ulp. // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. // rcp_f64 is never accurate. 
- const bool RcpIsAccurate = HasFP32DenormalFlush && ReqdAccuracy >= 1.0f; + const bool RcpIsAccurate = ReqdAccuracy >= 1.0f; + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); + + Value *RsqOp = nullptr; + auto *DenII = dyn_cast<IntrinsicInst>(Den); + if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && + DenII->hasOneUse() && (RcpIsAccurate || AllowInaccurateRcp)) { + const auto *SqrtOp = cast<FPMathOperator>(DenII); + AllowApproxRsq = HasUnsafeFPMath || SqrtOp->hasApproxFunc(); + + if (AllowApproxRsq || SqrtOp->getFPAccuracy() >= 1.0f) { + SqrtFMF = SqrtOp->getFastMathFlags(); + RsqOp = SqrtOp->getOperand(0); + } + } IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); - Builder.setFastMathFlags(FMF); + Builder.setFastMathFlags(DivFMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); - Value *Num = FDiv.getOperand(0); - Value *Den = FDiv.getOperand(1); - Value *NewFDiv = nullptr; if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) { NewFDiv = PoisonValue::get(VT); @@ -868,32 +1005,71 @@ // constant. This works when the scalarizer pass is run first. for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { Value *NumEltI = Builder.CreateExtractElement(Num, I); - Value *DenEltI = Builder.CreateExtractElement(Den, I); - // Try rcp first. - Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp, - RcpIsAccurate, Builder, Mod); - if (!NewElt) // Try fdiv.fast. - NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy, - HasFP32DenormalFlush, Builder, Mod); - if (!NewElt) // Keep the original. 
- NewElt = Builder.CreateFDiv(NumEltI, DenEltI); + + Value *NewElt = nullptr; + if (RsqOp) { + Value *DenEltI = Builder.CreateExtractElement(RsqOp, I); + NewElt = optimizeWithRsq(Builder, NumEltI, DenEltI, DivFMF, SqrtFMF, + &FDiv, AllowApproxRsq); + if (!NewElt) { + // TODO: Avoid inserting dead extract in the first place + if (Instruction *Extract = dyn_cast<Instruction>(DenEltI)) + Extract->eraseFromParent(); + } + } + + Value *DenEltI = nullptr; + + if (!NewElt && (RcpIsAccurate || AllowInaccurateRcp)) { + DenEltI = Builder.CreateExtractElement(Den, I); + + // Try rcp first. + NewElt = optimizeWithRcp(Builder, NumEltI, DenEltI, DivFMF, + cast<Instruction>(FPOp), AllowInaccurateRcp, + RcpIsAccurate); + if (!NewElt) // Try fdiv.fast. + NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy, + HasFP32DenormalFlush, Builder, Mod); + } + + if (!NewElt) { + if (!DenEltI) + DenEltI = Builder.CreateExtractElement(Den, I); + + // Keep the original, but scalarized. + Value *ScalarDiv = Builder.CreateFDiv(NumEltI, DenEltI); + if (auto *ScalarDivInst = dyn_cast<Instruction>(ScalarDiv)) + ScalarDivInst->copyMetadata(FDiv); + NewElt = ScalarDiv; + } NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } } else { // Scalar FDiv. - // Try rcp first. - NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate, - Builder, Mod); - if (!NewFDiv) { // Try fdiv.fast. - NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, - HasFP32DenormalFlush, Builder, Mod); + if (RsqOp) { + NewFDiv = optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, + cast<Instruction>(FPOp), AllowApproxRsq); + } + + if (!NewFDiv) { + // Try rcp first. + if (RcpIsAccurate || AllowInaccurateRcp) { + NewFDiv = + optimizeWithRcp(Builder, Num, Den, DivFMF, cast<Instruction>(FPOp), + AllowInaccurateRcp, RcpIsAccurate); + } + + if (!NewFDiv) { // Try fdiv.fast. 
+ NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, + HasFP32DenormalFlush, Builder, Mod); + } } } if (NewFDiv) { FDiv.replaceAllUsesWith(NewFDiv); NewFDiv->takeName(&FDiv); - FDiv.eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo); } return !!NewFDiv; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -745,18 +745,18 @@ } define float @v_rcp_f32_ulp25(float %x) { -; GCN-IEEE-LABEL: v_rcp_f32_ulp25: -; GCN-IEEE: ; %bb.0: -; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: v_mov_b32_e32 v1, 0x6f800000 -; GCN-IEEE-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, v1 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-IEEE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_rcp_f32_ulp25: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; GFX6-IEEE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-FLUSH-LABEL: v_rcp_f32_ulp25: ; GCN-FLUSH: ; %bb.0: @@ -767,12 +767,11 @@ ; GFX10-IEEE-LABEL: v_rcp_f32_ulp25: ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v0| -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x2f800000, s4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 
v0, v1 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-IEEE-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-IEEE-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_rcp_f32_ulp25: @@ -784,15 +783,13 @@ ; GFX11-IEEE-LABEL: v_rcp_f32_ulp25: ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-IEEE-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v0| -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x2f800000, s0 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-IEEE-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLUSH-LABEL: v_rcp_f32_ulp25: @@ -805,49 +802,14 @@ } define float @v_fdiv_f32_afn_ulp25(float %a, float %b) { -; GCN-LABEL: v_fdiv_f32_afn_ulp25: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fdiv_f32_afn_ulp25: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 
-; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fdiv_f32_afn_ulp25: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %fdiv = fdiv afn float %a, %b, !fpmath !0 - ret float %fdiv -} - -define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) { -; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_arcp_ulp25: -; GFX6-IEEE-FASTFMA: ; %bb.0: -; GFX6-IEEE-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_fdiv_f32_afn_ulp25: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: +; GCN-FLUSH-LABEL: v_fdiv_f32_afn_ulp25: ; GCN-FLUSH: ; %bb.0: ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0x6f800000 @@ -860,55 +822,14 @@ ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_arcp_ulp25: -; GFX6-IEEE-SLOWFMA: ; %bb.0: -; GFX6-IEEE-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], 
v1, v1, v0 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: -; GFX89-IEEE: ; %bb.0: -; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX10-IEEE-LABEL: v_fdiv_f32_afn_ulp25: ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, 
v2, v1, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX10-FLUSH-LABEL: v_fdiv_f32_afn_ulp25: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v1| @@ -919,28 +840,15 @@ ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX11-IEEE-LABEL: v_fdiv_f32_afn_ulp25: ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX11-FLUSH-LABEL: v_fdiv_f32_afn_ulp25: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v1| @@ -952,6 +860,71 @@ ; GFX11-FLUSH-NEXT: 
s_waitcnt_depctr 0xfff ; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn float %a, %b, !fpmath !0 + ret float %fdiv +} + +define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) { +; GFX6-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v3 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; GFX6-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_sub_nc_u32_e32 v1, 0, v1 +; GFX10-IEEE-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: 
v_frexp_mant_f32_e32 v2, v1 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v1, 0, v1 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp float %a, %b, !fpmath !0 ret float %fdiv @@ -2167,24 +2140,25 @@ } define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) { -; GCN-IEEE-LABEL: v_rcp_v2f32_ulp25: -; GCN-IEEE: ; %bb.0: -; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc -; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v3 -; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2 -; GCN-IEEE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v3, v0 -; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v2, v1 -; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_rcp_v2f32_ulp25: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-IEEE-NEXT: 
v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; GFX6-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; GFX6-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-FLUSH-LABEL: v_rcp_v2f32_ulp25: ; GCN-FLUSH: ; %bb.0: @@ -2196,18 +2170,16 @@ ; GFX10-IEEE-LABEL: v_rcp_v2f32_ulp25: ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v0| -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4 -; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v1| -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-IEEE-NEXT: v_sub_nc_u32_e32 v1, 0, v1 +; GFX10-IEEE-NEXT: v_ldexp_f32 v0, v2, v0 +; GFX10-IEEE-NEXT: v_ldexp_f32 v1, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_rcp_v2f32_ulp25: @@ -2220,19 +2192,19 @@ ; GFX11-IEEE-LABEL: v_rcp_v2f32_ulp25: ; 
GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-IEEE-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v0| -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0 -; GFX11-IEEE-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v1| -; GFX11-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v0, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v1, 0, v1 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 -; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, v2, v0 :: v_dual_mul_f32 v1, v3, v1 +; GFX11-IEEE-NEXT: v_ldexp_f32 v0, v2, v0 +; GFX11-IEEE-NEXT: v_ldexp_f32 v1, v3, v1 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLUSH-LABEL: v_rcp_v2f32_ulp25: @@ -2246,65 +2218,16 @@ } define <2 x float> @v_fdiv_v2f32_afn_ulp25(<2 x float> %a, <2 x float> %b) { -; GCN-LABEL: v_fdiv_v2f32_afn_ulp25: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 -; 
GCN-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fdiv_v2f32_afn_ulp25: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fdiv_v2f32_afn_ulp25: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v2, v2 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %fdiv = fdiv afn <2 x float> %a, %b, !fpmath !0 - ret <2 x float> %fdiv -} - -define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) { -; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_v2f32_arcp_ulp25: -; GFX6-IEEE-FASTFMA: ; %bb.0: -; GFX6-IEEE-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v7, v5, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v7, v6, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v8, -v4, v7, v6 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v7, v8, v5, v7 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, -v4, v7, v6 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v6, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, -v5, v6, 1.0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, v4, v6, v6 -; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v6, v2, v4 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v7, 
-v5, v6, v2 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v5, v6, v2 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_fdiv_v2f32_afn_ulp25: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GCN-FLUSH-LABEL: v_fdiv_v2f32_afn_ulp25: ; GCN-FLUSH: ; %bb.0: ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 @@ -2323,91 +2246,16 @@ ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_v2f32_arcp_ulp25: -; GFX6-IEEE-SLOWFMA: ; %bb.0: -; GFX6-IEEE-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v8, v4 -; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v9, v5 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v11, v13, v9, v11 -; 
GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, -v4, v10, v6 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX6-IEEE-SLOWFMA-NEXT: s_mov_b64 vcc, s[4:5] -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v5, v5, v9, v11 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v1, v5, v3, v1 -; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: -; GFX89-IEEE: ; %bb.0: -; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 -; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 -; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX10-IEEE-LABEL: v_fdiv_v2f32_afn_ulp25: ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 -; GFX10-IEEE-NEXT: 
v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX10-FLUSH-LABEL: v_fdiv_v2f32_afn_ulp25: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v2| @@ -2424,43 +2272,16 @@ ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX11-IEEE-LABEL: v_fdiv_v2f32_afn_ulp25: ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 -; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 -; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 +; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX11-FLUSH-LABEL: v_fdiv_v2f32_afn_ulp25: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-FLUSH-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v2| @@ -2476,6 +2297,96 @@ ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff ; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 ; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v4, v0 :: v_dual_mul_f32 v1, v5, v1 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn <2 x float> %a, %b, !fpmath !0 + ret <2 x float> %fdiv +} + +define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) { +; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v2 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s4 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2 +; GFX6-IEEE-NEXT: v_ldexp_f32_e32 v2, v4, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v3 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s4 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 +; GFX6-IEEE-NEXT: v_sub_i32_e32 v3, vcc, 0, v3 +; GFX6-IEEE-NEXT: v_ldexp_f32_e32 v2, v2, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v2 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 
v5, v3 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v5 +; GFX10-IEEE-NEXT: v_sub_nc_u32_e32 v2, 0, v2 +; GFX10-IEEE-NEXT: v_sub_nc_u32_e32 v3, 0, v3 +; GFX10-IEEE-NEXT: v_ldexp_f32 v2, v4, v2 +; GFX10-IEEE-NEXT: v_ldexp_f32 v3, v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v2 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v5, v3 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v2, 0, v2 +; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v3, 0, v3 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_ldexp_f32 v2, v4, v2 +; GFX11-IEEE-NEXT: v_ldexp_f32 v3, v5, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: 
v_rcp_f32_e32 v2, v2 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv @@ -2515,4 +2426,3 @@ !0 = !{float 2.500000e+00} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX6-FLUSH: {{.*}} -; GFX6-IEEE: {{.*}} Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -22,31 +22,79 @@ } define amdgpu_kernel void @fdiv_fpmath_f32(ptr addrspace(1) %out, float %a, float %b) { -; IEEE-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32 -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1:[0-9]+]] { -; IEEE-NEXT: [[NO_MD:%.*]] = fdiv float [[A]], [[B]] -; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 -; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 -; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 -; IEEE-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) -; IEEE-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] -; IEEE-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: 
[[TMP2:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[B]]) -; IEEE-NEXT: [[AFN_MD_25ULP:%.*]] = fmul afn float [[A]], [[TMP2]] -; IEEE-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] -; IEEE-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[ARCP_MD_25ULP:%.*]] = fdiv arcp float [[A]], [[B]], !fpmath !0 -; IEEE-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[ARCP_MD_1ULP:%.*]] = fdiv arcp float [[A]], [[B]], !fpmath !2 -; IEEE-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32 +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; IEEE-GOODFREXP-NEXT: [[NO_MD:%.*]] = fdiv float [[A]], [[B]] +; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] +; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) 
[[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[AFN_MD_25ULP:%.*]] = fdiv afn float [[A]], [[B]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] +; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP2]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP7]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP13]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32 +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; IEEE-BADFREXP-NEXT: [[NO_MD:%.*]] = fdiv 
float [[A]], [[B]] +; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] +; IEEE-BADFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[AFN_MD_25ULP:%.*]] = fdiv afn float [[A]], [[B]], !fpmath !0 +; IEEE-BADFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] +; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP7]] +; 
IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP13]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1:[0-9]+]] { @@ -63,14 +111,15 @@ ; DAZ-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) ; DAZ-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] ; DAZ-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP2:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[B]]) -; DAZ-NEXT: [[AFN_MD_25ULP:%.*]] = fmul afn float [[A]], [[TMP2]] +; DAZ-NEXT: [[AFN_MD_25ULP:%.*]] = call afn float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) ; DAZ-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; DAZ-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) +; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP2]] ; DAZ-NEXT: store volatile float 
[[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fdiv arcp float [[A]], [[B]], !fpmath !2 +; DAZ-NEXT: [[TMP3:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP3]] ; DAZ-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -99,7 +148,7 @@ define amdgpu_kernel void @fdiv_fpmath_f32_flags(ptr addrspace(1) %out, float %a, float %b) { ; IEEE-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_flags -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1]] { +; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1:[0-9]+]] { ; IEEE-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = fdiv nnan ninf float [[A]], [[B]], !fpmath !2 ; IEEE-NEXT: store volatile float [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-NEXT: [[MD_25ULP_NINF_NNAN:%.*]] = fdiv nnan ninf float [[A]], [[B]], !fpmath !0 @@ -152,38 +201,115 @@ } define amdgpu_kernel void @rcp_fdiv_f32_fpmath(ptr addrspace(1) %out, float %x) { -; IEEE-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[NO_MD:%.*]] = fdiv float 1.000000e+00, [[X]] -; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP:%.*]] = fdiv float 1.000000e+00, [[X]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[X]]) -; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float 1.000000e+00, [[X]], !fpmath !1 -; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[AFN_NO_MD:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[X]]) -; 
IEEE-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[AFN_25ULP:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[X]]) -; IEEE-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[X]]) -; IEEE-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[X]]) -; IEEE-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[NEG_MD_1ULP:%.*]] = fdiv float -1.000000e+00, [[X]], !fpmath !2 -; IEEE-NEXT: store volatile float [[NEG_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[NEG_MD_25ULP:%.*]] = call float @llvm.amdgcn.fdiv.fast(float -1.000000e+00, float [[X]]) -; IEEE-NEXT: store volatile float [[NEG_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP1:%.*]] = fneg afn float [[X]] -; IEEE-NEXT: [[NEG_AFN_NO_MD:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP1]]) -; IEEE-NEXT: store volatile float [[NEG_AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP2:%.*]] = fneg afn float [[X]] -; IEEE-NEXT: [[NEG_AFN_25ULP:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-NEXT: store volatile float [[NEG_AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP3:%.*]] = fneg fast float [[X]] -; IEEE-NEXT: [[NEG_FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) -; IEEE-NEXT: store volatile float [[NEG_FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[NO_MD:%.*]] = fdiv float 1.000000e+00, [[X]] +; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: 
[[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP6]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float 1.000000e+00, [[X]], !fpmath !1 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[AFN_NO_MD:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[AFN_25ULP:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float 
[[X]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fneg float [[X]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[NEG_MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[NEG_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fneg float [[X]] +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP17]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-GOODFREXP-NEXT: [[NEG_MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[NEG_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = fneg afn float [[X]] +; IEEE-GOODFREXP-NEXT: [[NEG_AFN_NO_MD:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[NEG_AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fneg afn float [[X]] +; IEEE-GOODFREXP-NEXT: [[NEG_AFN_25ULP:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP24]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[NEG_AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: 
[[TMP25:%.*]] = fneg fast float [[X]] +; IEEE-GOODFREXP-NEXT: [[NEG_FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[NEG_FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[NO_MD:%.*]] = fdiv float 1.000000e+00, [[X]] +; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float 1.000000e+00, [[X]], !fpmath !1 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: 
[[AFN_NO_MD:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[AFN_25ULP:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fneg float [[X]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[NEG_MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: store volatile float [[NEG_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fneg float [[X]] +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP17]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP17]]) +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[NEG_MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; 
IEEE-BADFREXP-NEXT: store volatile float [[NEG_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = fneg afn float [[X]] +; IEEE-BADFREXP-NEXT: [[NEG_AFN_NO_MD:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; IEEE-BADFREXP-NEXT: store volatile float [[NEG_AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fneg afn float [[X]] +; IEEE-BADFREXP-NEXT: [[NEG_AFN_25ULP:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP24]]) +; IEEE-BADFREXP-NEXT: store volatile float [[NEG_AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = fneg fast float [[X]] +; IEEE-BADFREXP-NEXT: [[NEG_FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; IEEE-BADFREXP-NEXT: store volatile float [[NEG_FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { @@ -250,17 +376,69 @@ } define amdgpu_kernel void @rcp_fdiv_f32_fpmath_flags(ptr addrspace(1) %out, float %x) { -; IEEE-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath_flags -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = fdiv nnan ninf float 1.000000e+00, [[X]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NINF:%.*]] = fdiv ninf float 1.000000e+00, [[X]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NNAN:%.*]] = fdiv nnan float 1.000000e+00, [[X]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NSZ:%.*]] = fdiv nsz float 1.000000e+00, [[X]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NSZ]], ptr addrspace(1) [[OUT]], align 4 -; 
IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath_flags +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP6]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP11]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NNAN:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-GOODFREXP-NEXT: store volatile float 
[[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = sub i32 0, [[TMP18]] +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NSZ:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP20]], i32 [[TMP19]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NSZ]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath_flags +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = call ninf float 
@llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NNAN:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = sub i32 0, [[TMP18]] +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NSZ:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP20]], i32 [[TMP19]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NSZ]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath_flags ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { @@ -290,15 +468,55 @@ } define amdgpu_kernel void @rcp_fdiv_f32_knownfinite(ptr addrspace(1) %out, -; IEEE-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_knownfinite -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan) [[NO_NAN:%.*]], float nofpclass(nan) [[NO_INF:%.*]], float nofpclass(nan inf) [[NO_INF_NAN:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[MD_1ULP_NO_NAN:%.*]] = fdiv float 1.000000e+00, [[NO_NAN]], !fpmath !2 
-; IEEE-NEXT: store volatile float [[MD_1ULP_NO_NAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NO_INF:%.*]] = fdiv float 1.000000e+00, [[NO_INF]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NO_INF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NO_INF_NAN:%.*]] = fdiv float 1.000000e+00, [[NO_INF_NAN]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NO_INF_NAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_knownfinite +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan) [[NO_NAN:%.*]], float nofpclass(nan) [[NO_INF:%.*]], float nofpclass(nan inf) [[NO_INF_NAN:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_NAN]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_NAN:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_NAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_INF]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP6]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_INF:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_INF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] 
= call { float, i32 } @llvm.frexp.f32.i32(float [[NO_INF_NAN]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP11]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_INF_NAN:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_INF_NAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_knownfinite +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan) [[NO_NAN:%.*]], float nofpclass(nan) [[NO_INF:%.*]], float nofpclass(nan inf) [[NO_INF_NAN:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_NAN]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[NO_NAN]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NO_NAN:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_NAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_INF]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[NO_INF]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NO_INF:%.*]] = call float 
@llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_INF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_INF_NAN]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[NO_INF_NAN]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NO_INF_NAN:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_INF_NAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_knownfinite ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan) [[NO_NAN:%.*]], float nofpclass(nan) [[NO_INF:%.*]], float nofpclass(nan inf) [[NO_INF_NAN:%.*]]) #[[ATTR1]] { @@ -326,13 +544,41 @@ } define amdgpu_kernel void @rcp_fdiv_f32_nozero(ptr addrspace(1) %out, -; IEEE-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_nozero -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(zero) [[NO_ZERO:%.*]], float nofpclass(zero sub) [[NO_ZERO_SUB:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[MD_1ULP_NO_ZERO:%.*]] = fdiv float 1.000000e+00, [[NO_ZERO]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NO_ZERO]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NO_ZERO_SUB:%.*]] = fdiv float 1.000000e+00, [[NO_ZERO_SUB]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NO_ZERO_SUB]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_nozero +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(zero) [[NO_ZERO:%.*]], float nofpclass(zero sub) [[NO_ZERO_SUB:%.*]]) #[[ATTR1]] { +; 
IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_ZERO]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_ZERO:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_ZERO]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_ZERO_SUB]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP6]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_ZERO_SUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_ZERO_SUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_nozero +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(zero) [[NO_ZERO:%.*]], float nofpclass(zero sub) [[NO_ZERO_SUB:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_ZERO]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[NO_ZERO]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NO_ZERO:%.*]] = call float @llvm.ldexp.f32.i32(float 
[[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_ZERO]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_ZERO_SUB]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[NO_ZERO_SUB]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NO_ZERO_SUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_ZERO_SUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_nozero ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(zero) [[NO_ZERO:%.*]], float nofpclass(zero sub) [[NO_ZERO_SUB:%.*]]) #[[ATTR1]] { @@ -353,15 +599,55 @@ } define amdgpu_kernel void @rcp_fdiv_f32_nosub(ptr addrspace(1) %out, -; IEEE-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_nosub -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[NO_SUB:%.*]], float nofpclass(nsub) [[NO_NSUB:%.*]], float nofpclass(psub) [[NO_PSUB:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[MD_1ULP_NO_SUB:%.*]] = fdiv float 1.000000e+00, [[NO_SUB]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NO_NSUB:%.*]] = fdiv float 1.000000e+00, [[NO_NSUB]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NO_NSUB]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NO_PSUB:%.*]] = fdiv float 1.000000e+00, [[NO_PSUB]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NO_PSUB]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_nosub +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) 
[[OUT:%.*]], float nofpclass(sub) [[NO_SUB:%.*]], float nofpclass(nsub) [[NO_NSUB:%.*]], float nofpclass(psub) [[NO_PSUB:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_SUB]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_SUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_NSUB]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP6]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_NSUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_NSUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_PSUB]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP11]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_PSUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_PSUB]], ptr addrspace(1) [[OUT]], align 4 +; 
IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_nosub +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[NO_SUB:%.*]], float nofpclass(nsub) [[NO_NSUB:%.*]], float nofpclass(psub) [[NO_PSUB:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_SUB]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[NO_SUB]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NO_SUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_NSUB]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[NO_NSUB]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NO_NSUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_NSUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[NO_PSUB]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[NO_PSUB]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: 
[[MD_1ULP_NO_PSUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_PSUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_nosub ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[NO_SUB:%.*]], float nofpclass(nsub) [[NO_NSUB:%.*]], float nofpclass(psub) [[NO_PSUB:%.*]]) #[[ATTR1]] { @@ -389,14 +675,33 @@ } define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub(ptr addrspace(1) %out, float %x) { -; IEEE-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) -; IEEE-NEXT: [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000 -; IEEE-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]]) -; IEEE-NEXT: [[MD_1ULP_NO_SUB:%.*]] = fdiv float 1.000000e+00, [[X]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]]) +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_SUB:%.*]] = call float 
@llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]]) +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NO_SUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { @@ -417,17 +722,39 @@ ; Test if we have an assumption on the output that it's not denormal. 
define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub_assume_result_nosub(ptr addrspace(1) %out, float %x) { -; IEEE-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub_assume_result_nosub -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) -; IEEE-NEXT: [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000 -; IEEE-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]]) -; IEEE-NEXT: [[MD_1ULP_NO_SUB:%.*]] = fdiv float 1.000000e+00, [[X]], !fpmath !2 -; IEEE-NEXT: [[FABS_RESULT:%.*]] = call float @llvm.fabs.f32(float [[MD_1ULP_NO_SUB]]) -; IEEE-NEXT: [[RESULT_IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_RESULT]], 0x3810000000000000 -; IEEE-NEXT: call void @llvm.assume(i1 [[RESULT_IS_NOT_SUBNORMAL]]) -; IEEE-NEXT: store volatile float [[MD_1ULP_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub_assume_result_nosub +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]]) +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NO_SUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[FABS_RESULT:%.*]] = call float @llvm.fabs.f32(float [[MD_1ULP_NO_SUB]]) +; IEEE-GOODFREXP-NEXT: 
[[RESULT_IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_RESULT]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: call void @llvm.assume(i1 [[RESULT_IS_NOT_SUBNORMAL]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub_assume_result_nosub +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]]) +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NO_SUB:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[FABS_RESULT:%.*]] = call float @llvm.fabs.f32(float [[MD_1ULP_NO_SUB]]) +; IEEE-BADFREXP-NEXT: [[RESULT_IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_RESULT]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: call void @llvm.assume(i1 [[RESULT_IS_NOT_SUBNORMAL]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_assume_nosub_assume_result_nosub ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { @@ -454,37 +781,149 @@ } define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_flags(ptr addrspace(1) %out, <2 x float> %x) { -; IEEE-LABEL: define amdgpu_kernel void 
@rcp_fdiv_f32_vector_fpmath_flags -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = fdiv nnan ninf float 1.000000e+00, [[TMP1]] -; IEEE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fdiv nnan ninf float 1.000000e+00, [[TMP4]] -; IEEE-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP7:%.*]] = fdiv ninf float 1.000000e+00, [[TMP6]] -; IEEE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 -; IEEE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP10:%.*]] = fdiv ninf float 1.000000e+00, [[TMP9]] -; IEEE-NEXT: [[MD_1ULP_NINF:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP10]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP12:%.*]] = fdiv nnan float 1.000000e+00, [[TMP11]] -; IEEE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 -; IEEE-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP15:%.*]] = fdiv nnan float 1.000000e+00, [[TMP14]] -; IEEE-NEXT: [[MD_1ULP_NNAN:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP15]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP17:%.*]] = fdiv nsz float 1.000000e+00, [[TMP16]] -; IEEE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float 
[[TMP17]], i64 0 -; IEEE-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP20:%.*]] = fdiv nsz float 1.000000e+00, [[TMP19]] -; IEEE-NEXT: [[MD_1ULP_NSZ:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP20]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP_NSZ]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_flags +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP2]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP9]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP10]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP15]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 
x float> [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP16]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP17]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP17]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = sub i32 0, [[TMP19]] +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP18]]) +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP21]], i32 [[TMP20]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP24]]) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP30]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] +; 
IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP33]]) +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP39]]) +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = sub i32 0, [[TMP42]] +; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP41]]) +; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP44]], i32 [[TMP43]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NNAN:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP45]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP46]]) +; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = extractvalue { float, i32 } [[TMP47]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP47]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = sub i32 0, [[TMP49]] +; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP48]]) +; IEEE-GOODFREXP-NEXT: [[TMP52:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP50]]) +; IEEE-GOODFREXP-NEXT: [[TMP53:%.*]] = insertelement <2 x float> poison, float [[TMP52]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP54:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP55:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP54]]) +; 
IEEE-GOODFREXP-NEXT: [[TMP56:%.*]] = extractvalue { float, i32 } [[TMP55]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP57:%.*]] = extractvalue { float, i32 } [[TMP55]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP58:%.*]] = sub i32 0, [[TMP57]] +; IEEE-GOODFREXP-NEXT: [[TMP59:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP56]]) +; IEEE-GOODFREXP-NEXT: [[TMP60:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP59]], i32 [[TMP58]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NSZ:%.*]] = insertelement <2 x float> [[TMP53]], float [[TMP60]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NSZ]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_flags +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP9]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP9]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP12]] +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call nnan 
ninf float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP15]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP17]], 0 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = sub i32 0, [[TMP19]] +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP18]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP21]], i32 [[TMP20]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP24]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP24]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP30]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[X]], i64 0 +; 
IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP33]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP39]]) +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP39]]) +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = sub i32 0, [[TMP42]] +; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP41]]) +; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP44]], i32 [[TMP43]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NNAN:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP45]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP46]]) +; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = extractvalue { float, i32 } [[TMP47]], 0 +; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP46]]) +; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = sub i32 0, [[TMP49]] +; IEEE-BADFREXP-NEXT: [[TMP51:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP48]]) +; IEEE-BADFREXP-NEXT: [[TMP52:%.*]] = call nsz 
float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP50]]) +; IEEE-BADFREXP-NEXT: [[TMP53:%.*]] = insertelement <2 x float> poison, float [[TMP52]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP54:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP55:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP54]]) +; IEEE-BADFREXP-NEXT: [[TMP56:%.*]] = extractvalue { float, i32 } [[TMP55]], 0 +; IEEE-BADFREXP-NEXT: [[TMP57:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP54]]) +; IEEE-BADFREXP-NEXT: [[TMP58:%.*]] = sub i32 0, [[TMP57]] +; IEEE-BADFREXP-NEXT: [[TMP59:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP56]]) +; IEEE-BADFREXP-NEXT: [[TMP60:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP59]], i32 [[TMP58]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NSZ:%.*]] = insertelement <2 x float> [[TMP53]], float [[TMP60]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NSZ]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_flags ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { @@ -547,29 +986,29 @@ ; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[A]], i64 0 ; IEEE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[B]], i64 0 -; IEEE-NEXT: [[TMP10:%.*]] = fdiv float [[TMP8]], [[TMP9]] +; IEEE-NEXT: [[TMP10:%.*]] = fdiv float [[TMP8]], [[TMP9]], !fpmath !1 ; IEEE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 ; IEEE-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[A]], i64 1 ; IEEE-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[B]], i64 1 -; IEEE-NEXT: [[TMP14:%.*]] = fdiv float [[TMP12]], [[TMP13]] +; IEEE-NEXT: [[TMP14:%.*]] = fdiv float [[TMP12]], [[TMP13]], !fpmath !1 ; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP14]], i64 1 ; IEEE-NEXT: 
store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[A]], i64 0 ; IEEE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[B]], i64 0 -; IEEE-NEXT: [[TMP17:%.*]] = fdiv float [[TMP15]], [[TMP16]] +; IEEE-NEXT: [[TMP17:%.*]] = fdiv float [[TMP15]], [[TMP16]], !fpmath !2 ; IEEE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i64 0 ; IEEE-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[A]], i64 1 ; IEEE-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[B]], i64 1 -; IEEE-NEXT: [[TMP21:%.*]] = fdiv float [[TMP19]], [[TMP20]] +; IEEE-NEXT: [[TMP21:%.*]] = fdiv float [[TMP19]], [[TMP20]], !fpmath !2 ; IEEE-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP21]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 0 ; IEEE-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[B]], i64 0 -; IEEE-NEXT: [[TMP24:%.*]] = fdiv float [[TMP22]], [[TMP23]] +; IEEE-NEXT: [[TMP24:%.*]] = fdiv float [[TMP22]], [[TMP23]], !fpmath !0 ; IEEE-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP24]], i64 0 ; IEEE-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[A]], i64 1 ; IEEE-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[B]], i64 1 -; IEEE-NEXT: [[TMP28:%.*]] = fdiv float [[TMP26]], [[TMP27]] +; IEEE-NEXT: [[TMP28:%.*]] = fdiv float [[TMP26]], [[TMP27]], !fpmath !0 ; IEEE-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP28]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-NEXT: ret void @@ -587,20 +1026,20 @@ ; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[A]], i64 0 ; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[B]], i64 0 -; DAZ-NEXT: [[TMP10:%.*]] = 
fdiv float [[TMP8]], [[TMP9]] +; DAZ-NEXT: [[TMP10:%.*]] = fdiv float [[TMP8]], [[TMP9]], !fpmath !1 ; DAZ-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 ; DAZ-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[A]], i64 1 ; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[B]], i64 1 -; DAZ-NEXT: [[TMP14:%.*]] = fdiv float [[TMP12]], [[TMP13]] +; DAZ-NEXT: [[TMP14:%.*]] = fdiv float [[TMP12]], [[TMP13]], !fpmath !1 ; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP14]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[A]], i64 0 ; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[B]], i64 0 -; DAZ-NEXT: [[TMP17:%.*]] = fdiv float [[TMP15]], [[TMP16]] +; DAZ-NEXT: [[TMP17:%.*]] = fdiv float [[TMP15]], [[TMP16]], !fpmath !2 ; DAZ-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i64 0 ; DAZ-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[A]], i64 1 ; DAZ-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[B]], i64 1 -; DAZ-NEXT: [[TMP21:%.*]] = fdiv float [[TMP19]], [[TMP20]] +; DAZ-NEXT: [[TMP21:%.*]] = fdiv float [[TMP19]], [[TMP20]], !fpmath !2 ; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP21]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 0 @@ -636,10 +1075,10 @@ ; CHECK-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 ; CHECK-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = fdiv float 1.000000e+00, [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv float 1.000000e+00, [[TMP6]], !fpmath !1 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = fdiv float 1.000000e+00, [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = fdiv float 1.000000e+00, [[TMP9]], !fpmath !1 ; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP10]], i64 1 ; CHECK-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 0 @@ -688,48 +1127,87 @@ } define amdgpu_kernel void @rcp_fdiv_f32_fpmath_vector_nonsplat(ptr addrspace(1) %out, <2 x float> %x) { -; CHECK-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath_vector_nonsplat -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = fdiv float 1.000000e+00, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = fdiv float 2.000000e+00, [[TMP4]] -; CHECK-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 -; CHECK-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = fmul afn float 2.000000e+00, [[TMP10]] -; CHECK-NEXT: [[AFN_NO_MD:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP11]], i64 1 -; CHECK-NEXT: store volatile <2 x float> [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[X]], 
i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP15]]) -; CHECK-NEXT: [[TMP17:%.*]] = fmul fast float 2.000000e+00, [[TMP16]] -; CHECK-NEXT: [[FAST_NO_MD:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP17]], i64 1 -; CHECK-NEXT: store volatile <2 x float> [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[X]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP18]]) -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x float> poison, float [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP22:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP21]]) -; CHECK-NEXT: [[TMP23:%.*]] = fmul afn float 2.000000e+00, [[TMP22]] -; CHECK-NEXT: [[AFN_25ULP:%.*]] = insertelement <2 x float> [[TMP20]], float [[TMP23]], i64 1 -; CHECK-NEXT: store volatile <2 x float> [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[X]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP24]]) -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x float> poison, float [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP28:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP27]]) -; CHECK-NEXT: [[TMP29:%.*]] = fmul fast float 2.000000e+00, [[TMP28]] -; CHECK-NEXT: [[FAST_25ULP:%.*]] = insertelement <2 x float> [[TMP26]], float [[TMP29]], i64 1 -; CHECK-NEXT: store volatile <2 x float> [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: ret void +; IEEE-LABEL: define amdgpu_kernel void 
@rcp_fdiv_f32_fpmath_vector_nonsplat +; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = fdiv float 1.000000e+00, [[TMP1]] +; IEEE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 +; IEEE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP5:%.*]] = fdiv float 2.000000e+00, [[TMP4]] +; IEEE-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP7:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 +; IEEE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP10:%.*]] = fdiv afn float 2.000000e+00, [[TMP9]] +; IEEE-NEXT: [[AFN_NO_MD:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP10]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 +; IEEE-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP15:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-NEXT: [[TMP16:%.*]] = fmul fast float 2.000000e+00, [[TMP15]] +; IEEE-NEXT: [[FAST_NO_MD:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP16]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP18:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP17]]) 
+; IEEE-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP18]], i64 0 +; IEEE-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP21:%.*]] = fdiv afn float 2.000000e+00, [[TMP20]], !fpmath !0 +; IEEE-NEXT: [[AFN_25ULP:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP21]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP23:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP22]]) +; IEEE-NEXT: [[TMP24:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i64 0 +; IEEE-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP26:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; IEEE-NEXT: [[TMP27:%.*]] = fmul fast float 2.000000e+00, [[TMP26]] +; IEEE-NEXT: [[FAST_25ULP:%.*]] = insertelement <2 x float> [[TMP24]], float [[TMP27]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath_vector_nonsplat +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = fdiv float 1.000000e+00, [[TMP1]] +; DAZ-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = fdiv float 2.000000e+00, [[TMP4]] +; DAZ-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP7:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], 
i64 0 +; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP10:%.*]] = fdiv afn float 2.000000e+00, [[TMP9]] +; DAZ-NEXT: [[AFN_NO_MD:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP10]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 8 +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; DAZ-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 +; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP15:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; DAZ-NEXT: [[TMP16:%.*]] = fmul fast float 2.000000e+00, [[TMP15]] +; DAZ-NEXT: [[FAST_NO_MD:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP16]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 8 +; DAZ-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP18:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP18]], i64 0 +; DAZ-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP21:%.*]] = call afn float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float [[TMP20]]) +; DAZ-NEXT: [[AFN_25ULP:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP21]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; DAZ-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP23:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP22]]) +; DAZ-NEXT: [[TMP24:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i64 0 +; DAZ-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP26:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; DAZ-NEXT: [[TMP27:%.*]] = fmul fast float 
2.000000e+00, [[TMP26]] +; DAZ-NEXT: [[FAST_25ULP:%.*]] = insertelement <2 x float> [[TMP24]], float [[TMP27]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; DAZ-NEXT: ret void ; %no.md = fdiv <2 x float> , %x store volatile <2 x float> %no.md, ptr addrspace(1) %out, align 8 @@ -745,67 +1223,130 @@ } define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant(ptr addrspace(1) %out, <2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> [[X]], float 1.000000e+00, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = fmul afn float [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = fmul afn float [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[AFN_25ULP:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP9]], i64 1 -; CHECK-NEXT: store volatile <2 x float> [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = fmul fast float [[TMP10]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float 
[[TMP13]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = fmul fast float [[TMP15]], [[TMP17]] -; CHECK-NEXT: [[FAST_25ULP:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP18]], i64 1 -; CHECK-NEXT: store volatile <2 x float> [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: ret void -; - %x.insert = insertelement <2 x float> %x, float 1.000000e+00, i32 0 - %afn.25ulp = fdiv afn <2 x float> %x.insert, %y, !fpmath !0 - store volatile <2 x float> %afn.25ulp, ptr addrspace(1) %out, align 8 - %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 - store volatile <2 x float> %fast.25ulp, ptr addrspace(1) %out, align 8 - ret void -} - -define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant_arcp(ptr addrspace(1) %out, <2 x float> %x, <2 x float> %y) { -; IEEE-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant_arcp +; IEEE-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR1]] { ; IEEE-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> [[X]], float 1.000000e+00, i32 0 ; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 ; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-NEXT: [[TMP3:%.*]] = fdiv arcp float [[TMP1]], [[TMP2]] +; IEEE-NEXT: [[TMP3:%.*]] = fdiv afn float [[TMP1]], [[TMP2]], !fpmath !0 ; IEEE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 ; IEEE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 ; IEEE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = fdiv arcp float [[TMP5]], [[TMP6]] -; IEEE-NEXT: 
[[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP7]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[ARCP_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-NEXT: [[TMP7:%.*]] = fdiv afn float [[TMP5]], [[TMP6]], !fpmath !0 +; IEEE-NEXT: [[AFN_25ULP:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP7]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 +; IEEE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] +; IEEE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; IEEE-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 +; IEEE-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-NEXT: [[TMP15:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-NEXT: [[TMP16:%.*]] = fmul fast float [[TMP13]], [[TMP15]] +; IEEE-NEXT: [[FAST_25ULP:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP16]], i64 1 +; IEEE-NEXT: store volatile <2 x float> [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-NEXT: ret void ; -; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant_arcp +; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> [[X]], float 1.000000e+00, i32 0 ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 ; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; DAZ-NEXT: [[TMP3:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[TMP1]], float [[TMP2]]) +; DAZ-NEXT: [[TMP3:%.*]] = call afn float @llvm.amdgcn.fdiv.fast(float 
[[TMP1]], float [[TMP2]]) ; DAZ-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 ; DAZ-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 ; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; DAZ-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[TMP5]], float [[TMP6]]) -; DAZ-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP7]], i64 1 +; DAZ-NEXT: [[TMP7:%.*]] = call afn float @llvm.amdgcn.fdiv.fast(float [[TMP5]], float [[TMP6]]) +; DAZ-NEXT: [[AFN_25ULP:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP7]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 +; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; DAZ-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; DAZ-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] +; DAZ-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 +; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; DAZ-NEXT: [[TMP15:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; DAZ-NEXT: [[TMP16:%.*]] = fmul fast float [[TMP13]], [[TMP15]] +; DAZ-NEXT: [[FAST_25ULP:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP16]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; DAZ-NEXT: ret void +; + %x.insert = insertelement <2 x float> %x, float 1.000000e+00, i32 0 + %afn.25ulp = fdiv afn <2 x float> %x.insert, %y, !fpmath !0 + store volatile <2 x float> %afn.25ulp, ptr addrspace(1) %out, align 8 + %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 + store volatile <2 x float> %fast.25ulp, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void 
@rcp_fdiv_f32_vector_fpmath_partial_constant_arcp(ptr addrspace(1) %out, <2 x float> %x, <2 x float> %y) { +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant_arcp +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> [[X]], float 1.000000e+00, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractvalue { float, i32 } [[TMP3]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP1]], [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = 
fmul arcp float [[TMP11]], [[TMP18]] +; IEEE-GOODFREXP-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant_arcp +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> [[X]], float 1.000000e+00, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP1]], [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp float 
@llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fmul arcp float [[TMP11]], [[TMP18]] +; IEEE-BADFREXP-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-BADFREXP-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_partial_constant_arcp +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> [[X]], float 1.000000e+00, i32 0 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; DAZ-NEXT: [[TMP3:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP4:%.*]] = fmul arcp float [[TMP1]], [[TMP3]] +; DAZ-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i64 0 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 +; DAZ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; DAZ-NEXT: [[TMP8:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; DAZ-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP6]], [[TMP8]] +; DAZ-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP9]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[ARCP_25ULP]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: ret void ; @@ -816,89 +1357,190 @@ } define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) { -; IEEE-LABEL: define amdgpu_kernel void @rsq_f32_fpmath -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]) -; IEEE-NEXT: [[NO_MD:%.*]] = fdiv contract float 1.000000e+00, 
[[SQRT_X_NO_MD]] -; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; IEEE-NEXT: [[MD_1ULP:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_MD_1ULP]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_MD_1ULP_MULTI_USE:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; IEEE-NEXT: store volatile float [[SQRT_MD_1ULP_MULTI_USE]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_MULTI_USE:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_MD_1ULP_MULTI_USE]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_MULTI_USE]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_MD_25ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; IEEE-NEXT: [[MD_25ULP:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_MD_25ULP]]) -; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_MD_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !1 -; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_MD_HALF_ULP]], !fpmath !1 -; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_AFN_NO_MD:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]) -; IEEE-NEXT: [[AFN_NO_MD:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float [[SQRT_X_AFN_NO_MD]]) -; IEEE-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_AFN_25ULP:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; IEEE-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float [[SQRT_X_AFN_25ULP]]) -; IEEE-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_FAST_NO_MD:%.*]] = 
call fast float @llvm.sqrt.f32(float [[X]]) -; IEEE-NEXT: [[FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[SQRT_X_FAST_NO_MD]]) -; IEEE-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_FAST_25ULP:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; IEEE-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[SQRT_X_FAST_25ULP]]) -; IEEE-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP]]) -; IEEE-NEXT: store volatile float [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[NEG_SQRT_X_3ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[NEG_FDIV_OPENCL:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float -1.000000e+00, float [[NEG_SQRT_X_3ULP]]) -; IEEE-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !1 -; IEEE-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_HALF_ULP]]) -; IEEE-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]) -; IEEE-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_MISMATCH_MD1]]) -; IEEE-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD1]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_MISMATCH_MD2:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_SQRT_MISMATCH_MD2:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float 
[[SQRT_MISMATCH_MD2]]) -; IEEE-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD2]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rsq_f32_fpmath +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[NO_MD:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_X_NO_MD]] +; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = select contract i1 [[TMP1]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = fmul contract float [[X]], [[TMP2]] +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = select contract i1 [[TMP1]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = fmul contract float [[TMP4]], [[TMP5]] +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[SQRT_MD_1ULP_MULTI_USE:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-GOODFREXP-NEXT: store volatile float [[SQRT_MD_1ULP_MULTI_USE]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MD_1ULP_MULTI_USE]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP6]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_MULTI_USE:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) 
+; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_MULTI_USE]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = select contract i1 [[TMP11]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = fmul contract float [[X]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = select contract i1 [[TMP11]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = fmul contract float [[TMP14]], [[TMP15]] +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[SQRT_MD_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; IEEE-GOODFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_MD_HALF_ULP]], !fpmath !1 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[AFN_NO_MD:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = select 
contract i1 [[TMP16]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fmul contract float [[X]], [[TMP17]] +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP18]]) +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = select contract i1 [[TMP16]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[FDIV_OPENCL:%.*]] = fmul contract float [[TMP19]], [[TMP20]] +; IEEE-GOODFREXP-NEXT: store volatile float [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = select contract i1 [[TMP21]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = fmul contract float [[X]], [[TMP22]] +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP23]]) +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]] +; IEEE-GOODFREXP-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_HALF_ULP]]) +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = sub i32 0, [[TMP28]] +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]]) +; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4 +; 
IEEE-GOODFREXP-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]) +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = sub i32 0, [[TMP33]] +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) +; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = fcmp contract afn olt float [[X]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = select contract afn i1 [[TMP36]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = fmul contract afn float [[X]], [[TMP37]] +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP38]]) +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = select contract afn i1 [[TMP36]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD2:%.*]] = fmul contract afn float [[TMP39]], [[TMP40]] +; IEEE-GOODFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD2]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rsq_f32_fpmath +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[NO_MD:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_X_NO_MD]] +; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = fcmp contract olt float 
[[X]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = select contract i1 [[TMP1]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = fmul contract float [[X]], [[TMP2]] +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = select contract i1 [[TMP1]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = fmul contract float [[TMP4]], [[TMP5]] +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[SQRT_MD_1ULP_MULTI_USE:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-BADFREXP-NEXT: store volatile float [[SQRT_MD_1ULP_MULTI_USE]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MD_1ULP_MULTI_USE]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP6]], 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[SQRT_MD_1ULP_MULTI_USE]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 0, [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[MD_1ULP_MULTI_USE:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP9]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_MULTI_USE]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = select contract i1 [[TMP11]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = fmul contract float [[X]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = select contract i1 [[TMP11]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: 
[[MD_25ULP:%.*]] = fmul contract float [[TMP14]], [[TMP15]] +; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[SQRT_MD_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; IEEE-BADFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_MD_HALF_ULP]], !fpmath !1 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[AFN_NO_MD:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = select contract i1 [[TMP16]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fmul contract float [[X]], [[TMP17]] +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP18]]) +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = select contract i1 [[TMP16]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[FDIV_OPENCL:%.*]] = fmul contract float [[TMP19]], [[TMP20]] +; IEEE-BADFREXP-NEXT: store volatile float [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 +; 
IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = select contract i1 [[TMP21]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = fmul contract float [[X]], [[TMP22]] +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00 +; IEEE-BADFREXP-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]] +; IEEE-BADFREXP-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !1 +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_HALF_ULP]]) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0 +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[SQRT_X_HALF_ULP]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = sub i32 0, [[TMP28]] +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]]) +; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]]) +; IEEE-BADFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[SQRT_MISMATCH_MD1]]) +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = sub i32 0, [[TMP33]] +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) +; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float 
@llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) +; IEEE-BADFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = fcmp contract afn olt float [[X]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = select contract afn i1 [[TMP36]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = fmul contract afn float [[X]], [[TMP37]] +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP38]]) +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = select contract afn i1 [[TMP36]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD2:%.*]] = fmul contract afn float [[TMP39]], [[TMP40]] +; IEEE-BADFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD2]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rsq_f32_fpmath ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]) ; DAZ-NEXT: [[NO_MD:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_X_NO_MD]] ; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; DAZ-NEXT: [[MD_1ULP:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_MD_1ULP]]) +; DAZ-NEXT: [[MD_1ULP:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[SQRT_MD_1ULP_MULTI_USE:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 ; DAZ-NEXT: store volatile float [[SQRT_MD_1ULP_MULTI_USE]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_1ULP_MULTI_USE:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_MD_1ULP_MULTI_USE]]) ; DAZ-NEXT: store volatile float [[MD_1ULP_MULTI_USE]], 
ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_MD_25ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; DAZ-NEXT: [[MD_25ULP:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_MD_25ULP]]) +; DAZ-NEXT: [[MD_25ULP:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[SQRT_MD_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !1 ; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_MD_HALF_ULP]], !fpmath !1 ; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_AFN_NO_MD:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]) -; DAZ-NEXT: [[AFN_NO_MD:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float [[SQRT_X_AFN_NO_MD]]) +; DAZ-NEXT: [[AFN_NO_MD:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_AFN_25ULP:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; DAZ-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float [[SQRT_X_AFN_25ULP]]) +; DAZ-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_FAST_NO_MD:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]) -; DAZ-NEXT: [[FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[SQRT_X_FAST_NO_MD]]) +; DAZ-NEXT: [[FAST_NO_MD:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_FAST_25ULP:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; DAZ-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[SQRT_X_FAST_25ULP]]) +; DAZ-NEXT: 
[[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP]]) +; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[NEG_SQRT_X_3ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: [[TMP1:%.*]] = fneg contract float [[NEG_SQRT_X_3ULP]] -; DAZ-NEXT: [[NEG_FDIV_OPENCL:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]]) +; DAZ-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fneg contract float [[TMP1]] ; DAZ-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !1 ; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_HALF_ULP]]) @@ -906,8 +1548,7 @@ ; DAZ-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]) ; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_MISMATCH_MD1]]) ; DAZ-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD1]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_MISMATCH_MD2:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD2:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float [[SQRT_MISMATCH_MD2]]) +; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD2:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD2]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ 
-982,45 +1623,63 @@ define amdgpu_kernel void @rsq_f32_fpmath_flags(ptr addrspace(1) %out, float %x) { ; IEEE-LABEL: define amdgpu_kernel void @rsq_f32_fpmath_flags ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT_X_3ULP_NINF_NNAN:%.*]] = call nnan ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NINF_NNAN:%.*]] = call nnan ninf contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NINF_NNAN]]) +; IEEE-NEXT: [[TMP1:%.*]] = fcmp nnan ninf contract olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select nnan ninf contract i1 [[TMP1]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP3:%.*]] = fmul nnan ninf contract float [[X]], [[TMP2]] +; IEEE-NEXT: [[TMP4:%.*]] = call nnan ninf contract float @llvm.amdgcn.rsq.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select nnan ninf contract i1 [[TMP1]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NINF_NNAN:%.*]] = fmul nnan ninf contract float [[TMP4]], [[TMP5]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NINF:%.*]] = call ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NINF:%.*]] = call ninf contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NINF]]) +; IEEE-NEXT: [[TMP6:%.*]] = fcmp ninf contract olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP7:%.*]] = select ninf contract i1 [[TMP6]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP8:%.*]] = fmul ninf contract float [[X]], [[TMP7]] +; IEEE-NEXT: [[TMP9:%.*]] = call ninf contract float @llvm.amdgcn.rsq.f32(float [[TMP8]]) +; IEEE-NEXT: [[TMP10:%.*]] = select ninf contract i1 [[TMP6]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NINF:%.*]] = fmul ninf contract float [[TMP9]], [[TMP10]] ; IEEE-NEXT: store volatile float 
[[FDIV_OPENCL_NINF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NNAN:%.*]] = call nnan contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NNAN:%.*]] = call nnan contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NNAN]]) +; IEEE-NEXT: [[TMP11:%.*]] = fcmp nnan contract olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP12:%.*]] = select nnan contract i1 [[TMP11]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP13:%.*]] = fmul nnan contract float [[X]], [[TMP12]] +; IEEE-NEXT: [[TMP14:%.*]] = call nnan contract float @llvm.amdgcn.rsq.f32(float [[TMP13]]) +; IEEE-NEXT: [[TMP15:%.*]] = select nnan contract i1 [[TMP11]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NNAN:%.*]] = fmul nnan contract float [[TMP14]], [[TMP15]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NSZ:%.*]] = call nsz contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NSZ:%.*]] = call nsz contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NSZ]]) +; IEEE-NEXT: [[TMP16:%.*]] = fcmp nsz contract olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP17:%.*]] = select nsz contract i1 [[TMP16]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP18:%.*]] = fmul nsz contract float [[X]], [[TMP17]] +; IEEE-NEXT: [[TMP19:%.*]] = call nsz contract float @llvm.amdgcn.rsq.f32(float [[TMP18]]) +; IEEE-NEXT: [[TMP20:%.*]] = select nsz contract i1 [[TMP16]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NSZ:%.*]] = fmul nsz contract float [[TMP19]], [[TMP20]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NSZ]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NINF_MIX0:%.*]] = call ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NNAN_MIX0:%.*]] = call nnan contract float 
@llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NINF_MIX0]]) +; IEEE-NEXT: [[TMP21:%.*]] = fcmp nnan ninf contract olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP22:%.*]] = select nnan ninf contract i1 [[TMP21]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP23:%.*]] = fmul nnan ninf contract float [[X]], [[TMP22]] +; IEEE-NEXT: [[TMP24:%.*]] = call nnan ninf contract float @llvm.amdgcn.rsq.f32(float [[TMP23]]) +; IEEE-NEXT: [[TMP25:%.*]] = select nnan ninf contract i1 [[TMP21]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NNAN_MIX0:%.*]] = fmul nnan ninf contract float [[TMP24]], [[TMP25]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NNAN_MIX0]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NINF_MIX1:%.*]] = call ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NNAN_MIX1:%.*]] = call nnan contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NINF_MIX1]]) +; IEEE-NEXT: [[TMP26:%.*]] = fcmp nnan ninf contract olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP27:%.*]] = select nnan ninf contract i1 [[TMP26]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP28:%.*]] = fmul nnan ninf contract float [[X]], [[TMP27]] +; IEEE-NEXT: [[TMP29:%.*]] = call nnan ninf contract float @llvm.amdgcn.rsq.f32(float [[TMP28]]) +; IEEE-NEXT: [[TMP30:%.*]] = select nnan ninf contract i1 [[TMP26]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NNAN_MIX1:%.*]] = fmul nnan ninf contract float [[TMP29]], [[TMP30]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NNAN_MIX1]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rsq_f32_fpmath_flags ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[SQRT_X_3ULP_NINF_NNAN:%.*]] = call nnan ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: 
[[FDIV_OPENCL_NINF_NNAN:%.*]] = call nnan ninf contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NINF_NNAN]]) +; DAZ-NEXT: [[FDIV_OPENCL_NINF_NNAN:%.*]] = call nnan ninf contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP_NINF:%.*]] = call ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NINF:%.*]] = call ninf contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NINF]]) +; DAZ-NEXT: [[FDIV_OPENCL_NINF:%.*]] = call ninf contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NINF]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP_NNAN:%.*]] = call nnan contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NNAN:%.*]] = call nnan contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NNAN]]) +; DAZ-NEXT: [[FDIV_OPENCL_NNAN:%.*]] = call nnan contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP_NSZ:%.*]] = call nsz contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NSZ:%.*]] = call nsz contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NSZ]]) +; DAZ-NEXT: [[FDIV_OPENCL_NSZ:%.*]] = call nsz contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NSZ]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP_NINF_MIX0:%.*]] = call ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NNAN_MIX0:%.*]] = call nnan contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NINF_MIX0]]) +; DAZ-NEXT: [[FDIV_OPENCL_NNAN_MIX0:%.*]] = call nnan ninf contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NNAN_MIX0]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: 
[[SQRT_X_3ULP_NINF_MIX1:%.*]] = call ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NNAN_MIX1:%.*]] = call nnan contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NINF_MIX1]]) +; DAZ-NEXT: [[FDIV_OPENCL_NNAN_MIX1:%.*]] = call nnan ninf contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NNAN_MIX1]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -1052,11 +1711,27 @@ } define float @rsq_f32_missing_contract0(float %x) { -; IEEE-LABEL: define float @rsq_f32_missing_contract0 -; IEEE-SAME: (float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT_X_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = fdiv contract float 1.000000e+00, [[SQRT_X_3ULP]], !fpmath !2 -; IEEE-NEXT: ret float [[FDIV_OPENCL]] +; IEEE-GOODFREXP-LABEL: define float @rsq_f32_missing_contract0 +; IEEE-GOODFREXP-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_3ULP]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[FDIV_OPENCL:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: ret float [[FDIV_OPENCL]] +; +; IEEE-BADFREXP-LABEL: define float @rsq_f32_missing_contract0 +; IEEE-BADFREXP-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_3ULP]]) +; IEEE-BADFREXP-NEXT: 
[[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[SQRT_X_3ULP]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[FDIV_OPENCL:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: ret float [[FDIV_OPENCL]] ; ; DAZ-LABEL: define float @rsq_f32_missing_contract0 ; DAZ-SAME: (float [[X:%.*]]) #[[ATTR1]] { @@ -1070,11 +1745,27 @@ } define float @rsq_f32_missing_contract1(float %x) { -; IEEE-LABEL: define float @rsq_f32_missing_contract1 -; IEEE-SAME: (float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT_X_3ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = fdiv float 1.000000e+00, [[SQRT_X_3ULP]], !fpmath !2 -; IEEE-NEXT: ret float [[FDIV_OPENCL]] +; IEEE-GOODFREXP-LABEL: define float @rsq_f32_missing_contract1 +; IEEE-GOODFREXP-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_3ULP]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[FDIV_OPENCL:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: ret float [[FDIV_OPENCL]] +; +; IEEE-BADFREXP-LABEL: define float @rsq_f32_missing_contract1 +; IEEE-BADFREXP-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 +; 
IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_3ULP]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[SQRT_X_3ULP]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[FDIV_OPENCL:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: ret float [[FDIV_OPENCL]] ; ; DAZ-LABEL: define float @rsq_f32_missing_contract1 ; DAZ-SAME: (float [[X:%.*]]) #[[ATTR1]] { @@ -1090,14 +1781,17 @@ define float @rsq_f32_flag_merge(float %x) { ; IEEE-LABEL: define float @rsq_f32_flag_merge ; IEEE-SAME: (float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT_X_3ULP:%.*]] = call ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = fdiv nsz contract float 1.000000e+00, [[SQRT_X_3ULP]], !fpmath !2 +; IEEE-NEXT: [[TMP1:%.*]] = fcmp ninf nsz contract olt float [[X]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select ninf nsz contract i1 [[TMP1]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP3:%.*]] = fmul ninf nsz contract float [[X]], [[TMP2]] +; IEEE-NEXT: [[TMP4:%.*]] = call ninf nsz contract float @llvm.amdgcn.rsq.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select ninf nsz contract i1 [[TMP1]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = fmul ninf nsz contract float [[TMP4]], [[TMP5]] ; IEEE-NEXT: ret float [[FDIV_OPENCL]] ; ; DAZ-LABEL: define float @rsq_f32_flag_merge ; DAZ-SAME: (float [[X:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call ninf contract float @llvm.sqrt.f32(float [[X]]), !fpmath !2 -; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = call nsz contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP]]) +; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = call ninf nsz contract float 
@llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: ret float [[FDIV_OPENCL]] ; %sqrt.x.3ulp = call contract ninf float @llvm.sqrt.f32(float %x), !fpmath !2 @@ -1108,27 +1802,36 @@ define amdgpu_kernel void @rsq_f32_knownfinite(ptr addrspace(1) %out, float nofpclass(nan) %no.nan, ; IEEE-LABEL: define amdgpu_kernel void @rsq_f32_knownfinite ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan) [[NO_NAN:%.*]], float nofpclass(nan) [[NO_INF:%.*]], float nofpclass(nan inf) [[NO_INF_NAN:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT_X_3ULP_NO_NAN:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_NAN]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NO_NAN:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NO_NAN]]) +; IEEE-NEXT: [[TMP1:%.*]] = fcmp contract olt float [[NO_NAN]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select contract i1 [[TMP1]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP3:%.*]] = fmul contract float [[NO_NAN]], [[TMP2]] +; IEEE-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select contract i1 [[TMP1]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NO_NAN:%.*]] = fmul contract float [[TMP4]], [[TMP5]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NO_NAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NO_INF:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_INF]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NO_INF:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NO_INF]]) +; IEEE-NEXT: [[TMP6:%.*]] = fcmp contract olt float [[NO_INF]], 0x3810000000000000 +; IEEE-NEXT: [[TMP7:%.*]] = select contract i1 [[TMP6]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP8:%.*]] = fmul contract float [[NO_INF]], [[TMP7]] +; IEEE-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP8]]) +; IEEE-NEXT: [[TMP10:%.*]] = select 
contract i1 [[TMP6]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NO_INF:%.*]] = fmul contract float [[TMP9]], [[TMP10]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NO_INF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NO_INF_NAN:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_INF_NAN]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NO_INF_NAN:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NO_INF_NAN]]) +; IEEE-NEXT: [[TMP11:%.*]] = fcmp contract olt float [[NO_INF_NAN]], 0x3810000000000000 +; IEEE-NEXT: [[TMP12:%.*]] = select contract i1 [[TMP11]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP13:%.*]] = fmul contract float [[NO_INF_NAN]], [[TMP12]] +; IEEE-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP13]]) +; IEEE-NEXT: [[TMP15:%.*]] = select contract i1 [[TMP11]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NO_INF_NAN:%.*]] = fmul contract float [[TMP14]], [[TMP15]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NO_INF_NAN]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rsq_f32_knownfinite ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan) [[NO_NAN:%.*]], float nofpclass(nan) [[NO_INF:%.*]], float nofpclass(nan inf) [[NO_INF_NAN:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[SQRT_X_3ULP_NO_NAN:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_NAN]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NO_NAN:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NO_NAN]]) +; DAZ-NEXT: [[FDIV_OPENCL_NO_NAN:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_NAN]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NO_NAN]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP_NO_INF:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_INF]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NO_INF:%.*]] = call contract float 
@llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NO_INF]]) +; DAZ-NEXT: [[FDIV_OPENCL_NO_INF:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_INF]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NO_INF]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP_NO_INF_NAN:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_INF_NAN]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NO_INF_NAN:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NO_INF_NAN]]) +; DAZ-NEXT: [[FDIV_OPENCL_NO_INF_NAN:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_INF_NAN]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NO_INF_NAN]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -1152,21 +1855,22 @@ define amdgpu_kernel void @rsq_f32_known_nozero(ptr addrspace(1) %out, float nofpclass(zero) %no.zero, float nofpclass(zero sub) %no.zero.sub) { ; IEEE-LABEL: define amdgpu_kernel void @rsq_f32_known_nozero ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(zero) [[NO_ZERO:%.*]], float nofpclass(zero sub) [[NO_ZERO_SUB:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT_X_3ULP_NO_ZERO:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_ZERO]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NO_ZERO:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NO_ZERO]]) +; IEEE-NEXT: [[TMP1:%.*]] = fcmp contract olt float [[NO_ZERO]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select contract i1 [[TMP1]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP3:%.*]] = fmul contract float [[NO_ZERO]], [[TMP2]] +; IEEE-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select contract i1 [[TMP1]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NO_ZERO:%.*]] = fmul contract float [[TMP4]], [[TMP5]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NO_ZERO]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NO_ZERO_SUB:%.*]] = call 
contract float @llvm.sqrt.f32(float [[NO_ZERO_SUB]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NO_ZERO_SUB:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NO_ZERO_SUB]]) +; IEEE-NEXT: [[FDIV_OPENCL_NO_ZERO_SUB:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_ZERO_SUB]]) ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NO_ZERO_SUB]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rsq_f32_known_nozero ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(zero) [[NO_ZERO:%.*]], float nofpclass(zero sub) [[NO_ZERO_SUB:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[SQRT_X_3ULP_NO_ZERO:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_ZERO]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NO_ZERO:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NO_ZERO]]) +; DAZ-NEXT: [[FDIV_OPENCL_NO_ZERO:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_ZERO]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NO_ZERO]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP_NO_ZERO_SUB:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_ZERO_SUB]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NO_ZERO_SUB:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NO_ZERO_SUB]]) +; DAZ-NEXT: [[FDIV_OPENCL_NO_ZERO_SUB:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_ZERO_SUB]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NO_ZERO_SUB]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -1184,27 +1888,31 @@ define amdgpu_kernel void @rsq_f32_known_nosub(ptr addrspace(1) %out, float nofpclass(sub) %no.sub, float nofpclass(psub) %no.psub, float nofpclass(nsub) %no.nsub) { ; IEEE-LABEL: define amdgpu_kernel void @rsq_f32_known_nosub ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[NO_SUB:%.*]], float nofpclass(psub) [[NO_PSUB:%.*]], float nofpclass(nsub) [[NO_NSUB:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT_X_3ULP_NO_SUB:%.*]] 
= call contract float @llvm.sqrt.f32(float [[NO_SUB]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NO_SUB:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NO_SUB]]) +; IEEE-NEXT: [[FDIV_OPENCL_NO_SUB:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_SUB]]) ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NO_PSUB:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_PSUB]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NO_PSUB:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NO_PSUB]]) +; IEEE-NEXT: [[TMP1:%.*]] = fcmp contract olt float [[NO_PSUB]], 0x3810000000000000 +; IEEE-NEXT: [[TMP2:%.*]] = select contract i1 [[TMP1]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP3:%.*]] = fmul contract float [[NO_PSUB]], [[TMP2]] +; IEEE-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP3]]) +; IEEE-NEXT: [[TMP5:%.*]] = select contract i1 [[TMP1]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NO_PSUB:%.*]] = fmul contract float [[TMP4]], [[TMP5]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NO_PSUB]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP_NO_NSUB:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_NSUB]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NO_NSUB:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NO_NSUB]]) +; IEEE-NEXT: [[TMP6:%.*]] = fcmp contract olt float [[NO_NSUB]], 0x3810000000000000 +; IEEE-NEXT: [[TMP7:%.*]] = select contract i1 [[TMP6]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP8:%.*]] = fmul contract float [[NO_NSUB]], [[TMP7]] +; IEEE-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP8]]) +; IEEE-NEXT: [[TMP10:%.*]] = select contract i1 [[TMP6]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[FDIV_OPENCL_NO_NSUB:%.*]] = 
fmul contract float [[TMP9]], [[TMP10]] ; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NO_NSUB]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rsq_f32_known_nosub ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[NO_SUB:%.*]], float nofpclass(psub) [[NO_PSUB:%.*]], float nofpclass(nsub) [[NO_NSUB:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[SQRT_X_3ULP_NO_SUB:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_SUB]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NO_SUB:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NO_SUB]]) +; DAZ-NEXT: [[FDIV_OPENCL_NO_SUB:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_SUB]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP_NO_PSUB:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_PSUB]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NO_PSUB:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NO_PSUB]]) +; DAZ-NEXT: [[FDIV_OPENCL_NO_PSUB:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_PSUB]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NO_PSUB]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP_NO_NSUB:%.*]] = call contract float @llvm.sqrt.f32(float [[NO_NSUB]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NO_NSUB:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NO_NSUB]]) +; DAZ-NEXT: [[FDIV_OPENCL_NO_NSUB:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[NO_NSUB]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NO_NSUB]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -1224,25 +1932,14 @@ } define amdgpu_kernel void @rsq_f32_assume_nosub(ptr addrspace(1) %out, float %x) { -; IEEE-LABEL: define amdgpu_kernel void @rsq_f32_assume_nosub -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) -; IEEE-NEXT: 
[[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000 -; IEEE-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]]) -; IEEE-NEXT: [[SQRT_X_3ULP_NO_SUB:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; IEEE-NEXT: [[FDIV_OPENCL_NO_SUB:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[SQRT_X_3ULP_NO_SUB]]) -; IEEE-NEXT: store volatile float [[FDIV_OPENCL_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void -; -; DAZ-LABEL: define amdgpu_kernel void @rsq_f32_assume_nosub -; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) -; DAZ-NEXT: [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000 -; DAZ-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]]) -; DAZ-NEXT: [[SQRT_X_3ULP_NO_SUB:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath !3 -; DAZ-NEXT: [[FDIV_OPENCL_NO_SUB:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_3ULP_NO_SUB]]) -; DAZ-NEXT: store volatile float [[FDIV_OPENCL_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: ret void +; CHECK-LABEL: define amdgpu_kernel void @rsq_f32_assume_nosub +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; CHECK-NEXT: [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000 +; CHECK-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]]) +; CHECK-NEXT: [[FDIV_OPENCL_NO_SUB:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]]) +; CHECK-NEXT: store volatile float [[FDIV_OPENCL_NO_SUB]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void ; %fabs.x = call float @llvm.fabs.f32(float %x) %is.not.subnormal = fcmp oge float %fabs.x, 0x3810000000000000 @@ -1264,29 +1961,52 @@ ; IEEE-NEXT: [[TMP5:%.*]] = fdiv contract float 1.000000e+00, [[TMP4]] ; IEEE-NEXT: [[NO_MD:%.*]] = 
insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; IEEE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0 -; IEEE-NEXT: [[TMP7:%.*]] = fdiv contract float 1.000000e+00, [[TMP6]] -; IEEE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 -; IEEE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1 -; IEEE-NEXT: [[TMP10:%.*]] = fdiv contract float 1.000000e+00, [[TMP9]] -; IEEE-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP10]], i64 1 +; IEEE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP7:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 +; IEEE-NEXT: [[TMP8:%.*]] = select contract i1 [[TMP7]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP9:%.*]] = fmul contract float [[TMP6]], [[TMP8]] +; IEEE-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP9]]) +; IEEE-NEXT: [[TMP11:%.*]] = select contract i1 [[TMP7]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP12:%.*]] = fmul contract float [[TMP10]], [[TMP11]] +; IEEE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 +; IEEE-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP14]], 0x3810000000000000 +; IEEE-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP14]], [[TMP16]] +; IEEE-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) +; IEEE-NEXT: [[TMP19:%.*]] = select contract i1 [[TMP15]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP19]] +; 
IEEE-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP20]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; IEEE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 -; IEEE-NEXT: [[TMP12:%.*]] = fdiv contract float 1.000000e+00, [[TMP11]] -; IEEE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 -; IEEE-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; IEEE-NEXT: [[TMP15:%.*]] = fdiv contract float undef, [[TMP14]] -; IEEE-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP15]], i64 1 +; IEEE-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP22:%.*]] = fcmp contract olt float [[TMP21]], 0x3810000000000000 +; IEEE-NEXT: [[TMP23:%.*]] = select contract i1 [[TMP22]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP21]], [[TMP23]] +; IEEE-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP24]]) +; IEEE-NEXT: [[TMP26:%.*]] = select contract i1 [[TMP22]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP27:%.*]] = fmul contract float [[TMP25]], [[TMP26]] +; IEEE-NEXT: [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 +; IEEE-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 +; IEEE-NEXT: [[TMP30:%.*]] = fdiv contract float undef, [[TMP29]], !fpmath !2 +; IEEE-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP28]], float [[TMP30]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 -; IEEE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> 
[[SQRT_X_3ULP]], i64 0 -; IEEE-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[TMP16]]) -; IEEE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i64 0 -; IEEE-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 -; IEEE-NEXT: [[TMP20:%.*]] = call contract float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float [[TMP19]]) -; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP20]], i64 1 +; IEEE-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP32:%.*]] = fcmp contract olt float [[TMP31]], 0x3810000000000000 +; IEEE-NEXT: [[TMP33:%.*]] = select contract i1 [[TMP32]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP34:%.*]] = fmul contract float [[TMP31]], [[TMP33]] +; IEEE-NEXT: [[TMP35:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP34]]) +; IEEE-NEXT: [[TMP36:%.*]] = select contract i1 [[TMP32]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP37:%.*]] = fmul contract float [[TMP35]], [[TMP36]] +; IEEE-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 +; IEEE-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP40:%.*]] = fcmp contract olt float [[TMP39]], 0x3810000000000000 +; IEEE-NEXT: [[TMP41:%.*]] = select contract i1 [[TMP40]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP42:%.*]] = fmul contract float [[TMP39]], [[TMP41]] +; IEEE-NEXT: [[TMP43:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP42]]) +; IEEE-NEXT: [[TMP44:%.*]] = select contract i1 [[TMP40]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP45:%.*]] = fmul contract float [[TMP43]], [[TMP44]] +; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP45]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-NEXT: ret void ; @@ -1300,28 
+2020,26 @@ ; DAZ-NEXT: [[TMP5:%.*]] = fdiv contract float 1.000000e+00, [[TMP4]] ; DAZ-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0 -; DAZ-NEXT: [[TMP7:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP7:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP6]]) ; DAZ-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 -; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1 -; DAZ-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP9]]) ; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP10]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 -; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) ; DAZ-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 ; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; DAZ-NEXT: [[TMP15:%.*]] = fdiv contract float undef, [[TMP14]] +; DAZ-NEXT: [[TMP15:%.*]] = fdiv contract float undef, 
[[TMP14]], !fpmath !2 ; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP15]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 -; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 -; DAZ-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP16]]) +; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP16]]) ; DAZ-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i64 0 -; DAZ-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 -; DAZ-NEXT: [[TMP20:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; DAZ-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP20:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP19]]) ; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP20]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void @@ -1367,18 +2085,52 @@ } define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp(ptr addrspace(1) %out, float %x, float %y, float %denom) { -; IEEE-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[ARCP0:%.*]] = fdiv arcp float [[X]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: [[ARCP1:%.*]] = fdiv arcp float [[Y]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp 
+; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; 
IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP12]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[ARCP0:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[X]], float [[DENOM]]) -; DAZ-NEXT: [[ARCP1:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[Y]], float [[DENOM]]) +; DAZ-NEXT: [[TMP1:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP1]] +; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP2]] ; DAZ-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void @@ -1391,21 +2143,70 @@ } 
define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_x3(ptr addrspace(1) %out, float %x, float %y, float %z, float %denom) { -; IEEE-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_x3 -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[ARCP0:%.*]] = fdiv arcp float [[X]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: [[ARCP1:%.*]] = fdiv arcp float [[Y]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: [[ARCP2:%.*]] = fdiv arcp float [[Z]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: store volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_x3 +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp 
float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-GOODFREXP-NEXT: [[ARCP2:%.*]] = fmul arcp float [[Z]], [[TMP18]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_x3 +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } 
@llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[ARCP2:%.*]] = fmul arcp float [[Z]], [[TMP18]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_x3 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[ARCP0:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[X]], float [[DENOM]]) -; DAZ-NEXT: [[ARCP1:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[Y]], float [[DENOM]]) -; DAZ-NEXT: [[ARCP2:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[Z]], float [[DENOM]]) +; DAZ-NEXT: [[TMP1:%.*]] = call arcp float 
@llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP1]] +; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP2]] +; DAZ-NEXT: [[TMP3:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP2:%.*]] = fmul arcp float [[Z]], [[TMP3]] ; DAZ-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: store volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 @@ -1421,17 +2222,38 @@ } define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_nomd(ptr addrspace(1) %out, float %x, float %y, float %denom) { -; IEEE-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_nomd -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[ARCP0:%.*]] = fdiv arcp float [[X]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: [[ARCP1:%.*]] = fdiv arcp float [[Y]], [[DENOM]] -; IEEE-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_nomd +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) 
+; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fdiv arcp float [[Y]], [[DENOM]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_nomd +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fdiv arcp float [[Y]], [[DENOM]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_25ulp_nomd ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[ARCP0:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[X]], float [[DENOM]]) +; DAZ-NEXT: [[TMP1:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP1]] ; DAZ-NEXT: [[ARCP1:%.*]] = fdiv arcp float [[Y]], [[DENOM]] ; DAZ-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) 
[[OUT]], align 4 ; DAZ-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 @@ -1445,18 +2267,39 @@ } define amdgpu_kernel void @multiple_arcp_fdiv_denom_nomd_25ulp(ptr addrspace(1) %out, float %x, float %y, float %denom) { -; IEEE-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_nomd_25ulp -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[ARCP0:%.*]] = fdiv arcp float [[X]], [[DENOM]] -; IEEE-NEXT: [[ARCP1:%.*]] = fdiv arcp float [[Y]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_nomd_25ulp +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fdiv arcp float [[X]], [[DENOM]] +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_nomd_25ulp +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], 
float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fdiv arcp float [[X]], [[DENOM]] +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP6]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_nomd_25ulp ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[ARCP0:%.*]] = fdiv arcp float [[X]], [[DENOM]] -; DAZ-NEXT: [[ARCP1:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[Y]], float [[DENOM]]) +; DAZ-NEXT: [[TMP1:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP1]] ; DAZ-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void @@ -1469,13 +2312,55 @@ } define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp(ptr addrspace(1) %out, float %x, float %y, float %denom) { -; CHECK-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ARCP0:%.*]] = fdiv arcp float [[X]], [[DENOM]], !fpmath !2 
-; CHECK-NEXT: [[ARCP1:%.*]] = fdiv arcp float [[Y]], [[DENOM]], !fpmath !2 -; CHECK-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float 
[[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP12]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[DENOM:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[TMP1:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP0:%.*]] = fmul arcp float [[X]], [[TMP1]] +; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP1:%.*]] = fmul arcp float [[Y]], [[TMP2]] +; DAZ-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) 
[[OUT]], align 4 +; DAZ-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; DAZ-NEXT: ret void ; %arcp0 = fdiv arcp float %x, %denom, !fpmath !2 %arcp1 = fdiv arcp float %y, %denom, !fpmath !2 @@ -1485,27 +2370,123 @@ } define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp_vector(ptr addrspace(1) %out, <2 x float> %x, <2 x float> %y, <2 x float> %denom) { -; CHECK-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp_vector -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[DENOM:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv arcp float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = fdiv arcp float [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP7]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = fdiv arcp float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; CHECK-NEXT: [[TMP14:%.*]] = fdiv arcp float [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP14]], i64 1 -; CHECK-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: ret void +; 
IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp_vector +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractvalue { float, i32 } [[TMP3]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP1]], [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fmul arcp float [[TMP11]], [[TMP18]] +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> 
[[Y]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = sub i32 0, [[TMP24]] +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP25]]) +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = fmul arcp float [[TMP20]], [[TMP27]] +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP33]]) +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = fmul arcp float [[TMP30]], [[TMP37]] +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP38]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp_vector +; IEEE-BADFREXP-SAME: (ptr 
addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[DENOM:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP1]], [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fmul arcp float [[TMP11]], [[TMP18]] +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; 
IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = sub i32 0, [[TMP24]] +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP25]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = fmul arcp float [[TMP20]], [[TMP27]] +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP33]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = fmul arcp float [[TMP30]], [[TMP37]] +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP38]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-BADFREXP-NEXT: ret void +; +; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp_vector +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[DENOM:%.*]]) 
#[[ATTR1]] { +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP3:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP4:%.*]] = fmul arcp float [[TMP1]], [[TMP3]] +; DAZ-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i64 0 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP8:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; DAZ-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP6]], [[TMP8]] +; DAZ-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP9]], i64 1 +; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP12:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; DAZ-NEXT: [[TMP13:%.*]] = fmul arcp float [[TMP10]], [[TMP12]] +; DAZ-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i64 0 +; DAZ-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP16]]) +; DAZ-NEXT: [[TMP18:%.*]] = fmul arcp float [[TMP15]], [[TMP17]] +; DAZ-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP18]], i64 1 +; DAZ-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 +; DAZ-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 +; DAZ-NEXT: ret void ; %arcp0 = fdiv arcp <2 x float> %x, %denom, !fpmath !2 %arcp1 = fdiv arcp <2 x float> %y, %denom, !fpmath !2 @@ -1515,20 +2496,55 @@ } define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp(ptr addrspace(1) %out, float %x, float %y, float %sqr.denom) { -; IEEE-LABEL: define amdgpu_kernel 
void @multiple_arcp_fdiv_sqrt_denom_25ulp -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 -; IEEE-NEXT: [[ARCP0:%.*]] = fdiv arcp contract float [[X]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: [[ARCP1:%.*]] = fdiv arcp contract float [[Y]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = 
call arcp contract float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP12]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP0]], ptr 
addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 -; DAZ-NEXT: [[ARCP0:%.*]] = call arcp contract float @llvm.amdgcn.fdiv.fast(float [[X]], float [[DENOM]]) -; DAZ-NEXT: [[ARCP1:%.*]] = call arcp contract float @llvm.amdgcn.fdiv.fast(float [[Y]], float [[DENOM]]) +; DAZ-NEXT: [[TMP1:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP1]] +; DAZ-NEXT: [[TMP2:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP2]] ; DAZ-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void @@ -1542,48 +2558,123 @@ } define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_vector_25ulp(ptr addrspace(1) %out, <2 x float> %x, <2 x float> %y, <2 x float> %sqr.denom) { -; IEEE-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_vector_25ulp -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[DENOM:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-NEXT: [[TMP3:%.*]] = fdiv arcp contract float [[TMP1]], [[TMP2]] -; IEEE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 -; IEEE-NEXT: [[TMP5:%.*]] = 
extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = fdiv arcp contract float [[TMP5]], [[TMP6]] -; IEEE-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP7]], i64 1 -; IEEE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-NEXT: [[TMP10:%.*]] = fdiv arcp contract float [[TMP8]], [[TMP9]] -; IEEE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 -; IEEE-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; IEEE-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-NEXT: [[TMP14:%.*]] = fdiv arcp contract float [[TMP12]], [[TMP13]] -; IEEE-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP14]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 -; IEEE-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_vector_25ulp +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractvalue { float, i32 } [[TMP3]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp contract float 
@llvm.amdgcn.rcp.f32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fmul arcp contract float [[TMP1]], [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fmul arcp contract float [[TMP11]], [[TMP18]] +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = sub i32 0, [[TMP24]] +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP25]]) +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = fmul arcp contract float [[TMP20]], 
[[TMP27]] +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP33]]) +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = fmul arcp contract float [[TMP30]], [[TMP37]] +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP38]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_vector_25ulp +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 
[[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fmul arcp contract float [[TMP1]], [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fmul arcp contract float [[TMP11]], [[TMP18]] +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = sub i32 0, [[TMP24]] +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call arcp 
contract float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP25]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = fmul arcp contract float [[TMP20]], [[TMP27]] +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP33]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = fmul arcp contract float [[TMP30]], [[TMP37]] +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP38]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_vector_25ulp ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[DENOM:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 ; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP3:%.*]] = call arcp contract float @llvm.amdgcn.fdiv.fast(float [[TMP1]], float [[TMP2]]) -; DAZ-NEXT: [[TMP4:%.*]] = insertelement <2 x float> 
poison, float [[TMP3]], i64 0 -; DAZ-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP7:%.*]] = call arcp contract float @llvm.amdgcn.fdiv.fast(float [[TMP5]], float [[TMP6]]) -; DAZ-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP7]], i64 1 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.amdgcn.fdiv.fast(float [[TMP8]], float [[TMP9]]) -; DAZ-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 -; DAZ-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP14:%.*]] = call arcp contract float @llvm.amdgcn.fdiv.fast(float [[TMP12]], float [[TMP13]]) -; DAZ-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP14]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP4:%.*]] = fmul arcp contract float [[TMP1]], [[TMP3]] +; DAZ-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i64 0 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP8:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; DAZ-NEXT: [[TMP9:%.*]] = fmul arcp contract float [[TMP6]], [[TMP8]] +; DAZ-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP9]], i64 1 +; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; DAZ-NEXT: [[TMP13:%.*]] = fmul arcp contract float [[TMP10]], [[TMP12]] +; DAZ-NEXT: 
[[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i64 0 +; DAZ-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP16]]) +; DAZ-NEXT: [[TMP18:%.*]] = fmul arcp contract float [[TMP15]], [[TMP17]] +; DAZ-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP18]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: ret void @@ -1597,23 +2688,73 @@ } define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp_x3(ptr addrspace(1) %out, float %x, float %y, float %z, float %sqr.denom) { -; IEEE-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp_x3 -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 -; IEEE-NEXT: [[ARCP0:%.*]] = fdiv arcp contract float [[X]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: [[ARCP1:%.*]] = fdiv arcp contract float [[Y]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: [[ARCP2:%.*]] = fdiv arcp contract float [[Z]], [[DENOM]], !fpmath !0 -; IEEE-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: store volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp_x3 +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 +; 
IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-GOODFREXP-NEXT: [[ARCP2:%.*]] = fmul arcp contract float [[Z]], [[TMP18]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP1]], ptr 
addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp_x3 +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 
@llvm.amdgcn.frexp.exp.i32.f32(float [[DENOM]]) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[ARCP2:%.*]] = fmul arcp contract float [[Z]], [[TMP18]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_sqrt_denom_25ulp_x3 ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[SQR_DENOM:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[DENOM:%.*]] = call contract float @llvm.sqrt.f32(float [[SQR_DENOM]]), !fpmath !3 -; DAZ-NEXT: [[ARCP0:%.*]] = call arcp contract float @llvm.amdgcn.fdiv.fast(float [[X]], float [[DENOM]]) -; DAZ-NEXT: [[ARCP1:%.*]] = call arcp contract float @llvm.amdgcn.fdiv.fast(float [[Y]], float [[DENOM]]) -; DAZ-NEXT: [[ARCP2:%.*]] = call arcp contract float @llvm.amdgcn.fdiv.fast(float [[Z]], float [[DENOM]]) +; DAZ-NEXT: [[TMP1:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP0:%.*]] = fmul arcp contract float [[X]], [[TMP1]] +; DAZ-NEXT: [[TMP2:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP1:%.*]] = fmul arcp contract float [[Y]], [[TMP2]] +; DAZ-NEXT: [[TMP3:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[DENOM]]) +; DAZ-NEXT: [[ARCP2:%.*]] = fmul arcp contract float [[Z]], [[TMP3]] ; DAZ-NEXT: store volatile float [[ARCP0]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: store volatile float [[ARCP1]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: store 
volatile float [[ARCP2]], ptr addrspace(1) [[OUT]], align 4 @@ -1633,35 +2774,45 @@ ; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator ; IEEE-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; IEEE-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = fdiv contract float 1.000000e+00, [[TMP1]] -; IEEE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fdiv contract float -1.000000e+00, [[TMP4]] -; IEEE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP5]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-NEXT: [[TMP8:%.*]] = fdiv contract float 4.000000e+00, [[TMP7]] -; IEEE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP8]], i64 2 -; IEEE-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-NEXT: [[TMP11:%.*]] = fdiv contract float undef, [[TMP10]] -; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i64 3 +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = fcmp contract olt float [[TMP1]], 0x3810000000000000 +; IEEE-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP4:%.*]] = fmul contract float [[TMP1]], [[TMP3]] +; IEEE-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) +; IEEE-NEXT: [[TMP6:%.*]] = select contract i1 [[TMP2]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP7:%.*]] = fmul contract float [[TMP5]], [[TMP6]] +; IEEE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; IEEE-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-NEXT: 
[[TMP10:%.*]] = fcmp contract olt float [[TMP9]], 0x3810000000000000 +; IEEE-NEXT: [[TMP11:%.*]] = select contract i1 [[TMP10]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP12:%.*]] = fmul contract float [[TMP9]], [[TMP11]] +; IEEE-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP12]]) +; IEEE-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP10]], float -4.096000e+03, float -1.000000e+00 +; IEEE-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP13]], [[TMP14]] +; IEEE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP15]], i64 1 +; IEEE-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-NEXT: [[TMP18:%.*]] = fdiv contract float 4.000000e+00, [[TMP17]], !fpmath !2 +; IEEE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP18]], i64 2 +; IEEE-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-NEXT: [[TMP21:%.*]] = fdiv contract float undef, [[TMP20]], !fpmath !2 +; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP21]], i64 3 ; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP1]]) ; DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = fneg contract float [[TMP4]] -; DAZ-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP5]]) +; DAZ-NEXT: 
[[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP6:%.*]] = fneg contract float [[TMP5]] ; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 ; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float 4.000000e+00, [[TMP8]] +; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float 4.000000e+00, [[TMP8]], !fpmath !2 ; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 ; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float undef, [[TMP11]] +; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float undef, [[TMP11]], !fpmath !2 ; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP12]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; @@ -1671,66 +2822,74 @@ } define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float> %arg) { -; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt +; CHECK-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt +; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[DENOM:%.*]] = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = fneg contract afn float [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 
x float> [[DENOM]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = fdiv contract float 4.000000e+00, [[TMP8]], !fpmath !2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = fdiv contract float undef, [[TMP11]], !fpmath !2 +; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP12]], i64 3 +; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; + %denom = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg) + %partial.rsq = fdiv contract <4 x float> , %denom, !fpmath !2 + ret <4 x float> %partial.rsq +} + +define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(<4 x float> %arg) { +; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div ; IEEE-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[DENOM:%.*]] = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = fdiv contract float 1.000000e+00, [[TMP1]] -; IEEE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fdiv contract float -1.000000e+00, [[TMP4]] -; IEEE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP5]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-NEXT: [[TMP8:%.*]] = fdiv contract float 4.000000e+00, [[TMP7]] -; IEEE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP8]], i64 2 -; IEEE-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-NEXT: [[TMP11:%.*]] = fdiv contract float undef, [[TMP10]] -; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i64 3 +; IEEE-NEXT: [[DENOM:%.*]] = call contract <4 x float> 
@llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = fcmp contract afn olt float [[TMP1]], 0x3810000000000000 +; IEEE-NEXT: [[TMP3:%.*]] = select contract afn i1 [[TMP2]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP4:%.*]] = fmul contract afn float [[TMP1]], [[TMP3]] +; IEEE-NEXT: [[TMP5:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP4]]) +; IEEE-NEXT: [[TMP6:%.*]] = select contract afn i1 [[TMP2]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP7:%.*]] = fmul contract afn float [[TMP5]], [[TMP6]] +; IEEE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; IEEE-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-NEXT: [[TMP10:%.*]] = fcmp contract afn olt float [[TMP9]], 0x3810000000000000 +; IEEE-NEXT: [[TMP11:%.*]] = select contract afn i1 [[TMP10]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP12:%.*]] = fmul contract afn float [[TMP9]], [[TMP11]] +; IEEE-NEXT: [[TMP13:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP12]]) +; IEEE-NEXT: [[TMP14:%.*]] = select contract afn i1 [[TMP10]], float -4.096000e+03, float -1.000000e+00 +; IEEE-NEXT: [[TMP15:%.*]] = fmul contract afn float [[TMP13]], [[TMP14]] +; IEEE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP15]], i64 1 +; IEEE-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-NEXT: [[TMP18:%.*]] = fdiv contract afn float 4.000000e+00, [[TMP17]] +; IEEE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP18]], i64 2 +; IEEE-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-NEXT: [[TMP21:%.*]] = fdiv contract afn float undef, [[TMP20]] +; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP21]], i64 3 ; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; -; DAZ-LABEL: define <4 x float> 
@rsq_f32_vector_mixed_constant_numerator_afn_sqrt +; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[DENOM:%.*]] = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]]) +; DAZ-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP1]]) ; DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = fneg contract float [[TMP4]] -; DAZ-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP5]]) +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP6:%.*]] = fneg contract afn float [[TMP5]] ; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 ; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float 4.000000e+00, [[TMP8]] +; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract afn float 4.000000e+00, [[TMP8]] ; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 ; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float undef, [[TMP11]] +; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract afn float undef, [[TMP11]] ; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP12]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] -; - %denom = call contract afn <4 x float> 
@llvm.sqrt.v4f32(<4 x float> %arg) - %partial.rsq = fdiv contract <4 x float> , %denom, !fpmath !2 - ret <4 x float> %partial.rsq -} - -define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(<4 x float> %arg) { -; CHECK-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div -; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = fneg contract afn float [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; CHECK-NEXT: [[TMP9:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = fmul contract afn float 4.000000e+00, [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP10]], i64 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; CHECK-NEXT: [[TMP13:%.*]] = call contract afn float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = fmul contract afn float undef, [[TMP13]] -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP14]], i64 3 -; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 %partial.rsq = fdiv contract afn <4 x float> , %denom @@ -1761,22 +2920,61 @@ } define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x 
float> %arg) { -; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt -; IEEE-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = fdiv contract float 1.000000e+00, [[TMP1]] -; IEEE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fdiv contract float -1.000000e+00, [[TMP4]] -; IEEE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP5]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-NEXT: [[TMP8:%.*]] = fdiv contract float 4.000000e+00, [[TMP7]] -; IEEE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP8]], i64 2 -; IEEE-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-NEXT: [[TMP11:%.*]] = fdiv contract float undef, [[TMP10]] -; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i64 3 -; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; IEEE-GOODFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt +; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP2]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; 
IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fneg contract float [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP11]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP16]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fdiv contract float 4.000000e+00, [[TMP18]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP19]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = fdiv contract float undef, [[TMP21]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP22]], i64 3 +; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt +; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 
} @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fneg contract float [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP16]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fdiv contract float 4.000000e+00, [[TMP18]], !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP19]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = fdiv contract float undef, [[TMP21]], !fpmath !2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP22]], i64 3 +; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> 
@rsq_f32_vector_mixed_constant_numerator_correct_sqrt ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { @@ -1789,10 +2987,10 @@ ; DAZ-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP5]]) ; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 ; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float 4.000000e+00, [[TMP8]] +; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float 4.000000e+00, [[TMP8]], !fpmath !2 ; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 ; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float undef, [[TMP11]] +; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float undef, [[TMP11]], !fpmath !2 ; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP12]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; @@ -1802,39 +3000,102 @@ } define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %arg) { -; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp -; IEEE-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = fdiv arcp contract float 1.000000e+00, [[TMP1]] -; IEEE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fdiv arcp contract float -1.000000e+00, [[TMP4]] -; IEEE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP5]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-NEXT: [[TMP8:%.*]] = fdiv arcp contract float 4.000000e+00, [[TMP7]] -; IEEE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> 
[[TMP6]], float [[TMP8]], i64 2 -; IEEE-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-NEXT: [[TMP11:%.*]] = fdiv arcp contract float undef, [[TMP10]] -; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i64 3 -; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; IEEE-GOODFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp +; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = fcmp arcp contract olt float [[TMP1]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = select arcp contract i1 [[TMP2]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = fmul arcp contract float [[TMP1]], [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = select arcp contract i1 [[TMP2]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = fmul arcp contract float [[TMP5]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fcmp arcp contract olt float [[TMP9]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = select arcp contract i1 [[TMP10]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = fmul arcp contract float [[TMP9]], [[TMP11]] +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = select arcp contract i1 [[TMP10]], float -4.096000e+03, float -1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] 
= fmul arcp contract float [[TMP13]], [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP15]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP17]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP23]] +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP24]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP27]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP27]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = sub i32 0, [[TMP29]] +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP28]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP31]], i32 [[TMP30]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = fmul arcp contract float undef, [[TMP32]] +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP33]], i64 3 +; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp +; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; 
IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = fcmp arcp contract olt float [[TMP1]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = select arcp contract i1 [[TMP2]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = fmul arcp contract float [[TMP1]], [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = select arcp contract i1 [[TMP2]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = fmul arcp contract float [[TMP5]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fcmp arcp contract olt float [[TMP9]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = select arcp contract i1 [[TMP10]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = fmul arcp contract float [[TMP9]], [[TMP11]] +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = select arcp contract i1 [[TMP10]], float -4.096000e+03, float -1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fmul arcp contract float [[TMP13]], [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP15]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP17]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 
[[TMP17]]) +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP23]] +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP24]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP27]], 0 +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = sub i32 0, [[TMP29]] +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP28]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP31]], i32 [[TMP30]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = fmul arcp contract float undef, [[TMP32]] +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP33]], i64 3 +; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP1]]) ; DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; 
DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = fneg arcp contract float [[TMP4]] -; DAZ-NEXT: [[TMP6:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP5]]) +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP6:%.*]] = fneg arcp contract float [[TMP5]] ; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 ; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = fdiv arcp contract float 4.000000e+00, [[TMP8]] -; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = fdiv arcp contract float undef, [[TMP11]] -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP12]], i64 3 +; DAZ-NEXT: [[TMP9:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; DAZ-NEXT: [[TMP10:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP9]] +; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP10]], i64 2 +; DAZ-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; DAZ-NEXT: [[TMP13:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; DAZ-NEXT: [[TMP14:%.*]] = fmul arcp contract float undef, [[TMP13]] +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP14]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 @@ -1866,21 +3127,83 @@ } define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp(<4 x float> %arg) { -; IEEE-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp -; IEEE-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[TMP1:%.*]] = 
extractelement <4 x float> [[ARG]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = fdiv arcp float 1.000000e+00, [[TMP1]] -; IEEE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fdiv arcp float -1.000000e+00, [[TMP4]] -; IEEE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP5]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; IEEE-NEXT: [[TMP8:%.*]] = fdiv arcp float 4.000000e+00, [[TMP7]] -; IEEE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP8]], i64 2 -; IEEE-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; IEEE-NEXT: [[TMP11:%.*]] = fdiv arcp float undef, [[TMP10]] -; IEEE-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i64 3 -; IEEE-NEXT: ret <4 x float> [[PARTIAL_RCP]] +; IEEE-GOODFREXP-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp +; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP2]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fneg arcp float [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 
[[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP11]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP16]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]] +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP23]], i32 [[TMP22]]) +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = fmul arcp float 4.000000e+00, [[TMP24]] +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP25]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP27]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = fmul arcp float undef, [[TMP33]] +; 
IEEE-GOODFREXP-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP34]], i64 3 +; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RCP]] +; +; IEEE-BADFREXP-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp +; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fneg arcp float [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP16]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = 
extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]] +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP23]], i32 [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = fmul arcp float 4.000000e+00, [[TMP24]] +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP25]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP27]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP27]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = fmul arcp float undef, [[TMP33]] +; IEEE-BADFREXP-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP34]], i64 3 +; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RCP]] ; ; DAZ-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { @@ -1892,11 +3215,13 @@ ; DAZ-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP5]]) ; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 ; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = fdiv arcp float 4.000000e+00, [[TMP8]] -; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 -; DAZ-NEXT: [[TMP11:%.*]] = 
extractelement <4 x float> [[ARG]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = fdiv arcp float undef, [[TMP11]] -; DAZ-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP12]], i64 3 +; DAZ-NEXT: [[TMP9:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; DAZ-NEXT: [[TMP10:%.*]] = fmul arcp float 4.000000e+00, [[TMP9]] +; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP10]], i64 2 +; DAZ-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP13:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; DAZ-NEXT: [[TMP14:%.*]] = fmul arcp float undef, [[TMP13]] +; DAZ-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP14]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RCP]] ; %partial.rcp = fdiv arcp <4 x float> , %arg, !fpmath !2 @@ -1926,40 +3251,21 @@ ; Make sure we don't crash if a vector square root has a constant vecctor input define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float> %x) { -; IEEE-LABEL: define <4 x float> @rsq_f32_vector_const_denom -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = fdiv contract float 1.000000e+00, [[TMP1]] -; IEEE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fdiv contract float -1.000000e+00, [[TMP4]] -; IEEE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP5]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 -; IEEE-NEXT: [[TMP8:%.*]] = fdiv contract float undef, [[TMP7]] -; IEEE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP8]], i64 2 -; IEEE-NEXT: [[TMP10:%.*]] = 
extractelement <4 x float> [[SQRT]], i64 3 -; IEEE-NEXT: [[TMP11:%.*]] = fdiv contract float 2.000000e+00, [[TMP10]] -; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i64 3 -; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] -; -; DAZ-LABEL: define <4 x float> @rsq_f32_vector_const_denom -; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]]) -; DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = fneg contract float [[TMP4]] -; DAZ-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP5]]) -; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float undef, [[TMP8]] -; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float 2.000000e+00, [[TMP11]] -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP12]], i64 3 -; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; CHECK-LABEL: define <4 x float> @rsq_f32_vector_const_denom +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = 
call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; CHECK-NEXT: [[TMP4:%.*]] = fneg contract float [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = fdiv contract float undef, [[TMP6]], !fpmath !2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = fdiv contract float 2.000000e+00, [[TMP9]], !fpmath !2 +; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP10]], i64 3 +; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %sqrt = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 %partial.rsq = fdiv contract <4 x float> , %sqrt, !fpmath !2 @@ -1967,9 +3273,45 @@ } define <4 x float> @fdiv_constant_f32_vector(ptr addrspace(1) %out, <2 x float> %x) { -; IEEE-LABEL: define <4 x float> @fdiv_constant_f32_vector -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: ret <4 x float> +; IEEE-GOODFREXP-LABEL: define <4 x float> @fdiv_constant_f32_vector +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 5.000000e-01) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } 
@llvm.frexp.f32.i32(float -2.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP13]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float 0x7FF8000000000000, i64 2 +; IEEE-GOODFREXP-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP15]], float 0x3FC99999A0000000, i64 3 +; IEEE-GOODFREXP-NEXT: ret <4 x float> [[CONST_PARTIAL_RCP]] +; +; IEEE-BADFREXP-LABEL: define <4 x float> @fdiv_constant_f32_vector +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 5.000000e-01) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 5.000000e-01) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float -2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float -2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call 
float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP13]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float 0x7FF8000000000000, i64 2 +; IEEE-BADFREXP-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP15]], float 0x3FC99999A0000000, i64 3 +; IEEE-BADFREXP-NEXT: ret <4 x float> [[CONST_PARTIAL_RCP]] ; ; DAZ-LABEL: define <4 x float> @fdiv_constant_f32_vector ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { @@ -1986,31 +3328,79 @@ } define amdgpu_kernel void @fdiv_fpmath_f32_nosub_lhs(ptr addrspace(1) %out, float nofpclass(sub) %a, float %b) { -; IEEE-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_nosub_lhs -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[A:%.*]], float [[B:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[NO_MD:%.*]] = fdiv float [[A]], [[B]] -; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 -; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 -; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 -; IEEE-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) -; IEEE-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] -; IEEE-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP2:%.*]] = call 
afn float @llvm.amdgcn.rcp.f32(float [[B]]) -; IEEE-NEXT: [[AFN_MD_25ULP:%.*]] = fmul afn float [[A]], [[TMP2]] -; IEEE-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] -; IEEE-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[ARCP_MD_25ULP:%.*]] = fdiv arcp float [[A]], [[B]], !fpmath !0 -; IEEE-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[ARCP_MD_1ULP:%.*]] = fdiv arcp float [[A]], [[B]], !fpmath !2 -; IEEE-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_nosub_lhs +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[A:%.*]], float [[B:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[NO_MD:%.*]] = fdiv float [[A]], [[B]] +; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] +; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) 
[[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[AFN_MD_25ULP:%.*]] = fdiv afn float [[A]], [[B]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] +; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP2]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP7]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP13]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_nosub_lhs +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[A:%.*]], float [[B:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: 
[[NO_MD:%.*]] = fdiv float [[A]], [[B]] +; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] +; IEEE-BADFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[AFN_MD_25ULP:%.*]] = fdiv afn float [[A]], [[B]], !fpmath !0 +; IEEE-BADFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] +; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float 
[[A]], [[TMP7]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP13]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_nosub_lhs ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[A:%.*]], float [[B:%.*]]) #[[ATTR1]] { @@ -2027,14 +3417,15 @@ ; DAZ-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) ; DAZ-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] ; DAZ-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP2:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[B]]) -; DAZ-NEXT: [[AFN_MD_25ULP:%.*]] = fmul afn float [[A]], [[TMP2]] +; DAZ-NEXT: [[AFN_MD_25ULP:%.*]] = call afn float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) ; DAZ-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; DAZ-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) +; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP2]] ; 
DAZ-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fdiv arcp float [[A]], [[B]], !fpmath !2 +; DAZ-NEXT: [[TMP3:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP3]] ; DAZ-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -2062,31 +3453,79 @@ } define amdgpu_kernel void @fdiv_fpmath_f32_nosub_rhs(ptr addrspace(1) %out, float %a, float nofpclass(sub) %b) { -; IEEE-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_nosub_rhs -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float nofpclass(sub) [[B:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[NO_MD:%.*]] = fdiv float [[A]], [[B]] -; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 -; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 -; IEEE-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 -; IEEE-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) -; IEEE-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] -; IEEE-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP2:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[B]]) -; IEEE-NEXT: [[AFN_MD_25ULP:%.*]] = fmul afn float [[A]], [[TMP2]] -; IEEE-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float 
[[A]], [[B]] -; IEEE-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[ARCP_MD_25ULP:%.*]] = fdiv arcp float [[A]], [[B]], !fpmath !0 -; IEEE-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[ARCP_MD_1ULP:%.*]] = fdiv arcp float [[A]], [[B]], !fpmath !2 -; IEEE-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_nosub_rhs +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float nofpclass(sub) [[B:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[NO_MD:%.*]] = fdiv float [[A]], [[B]] +; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] +; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[AFN_MD_25ULP:%.*]] = fdiv afn float [[A]], [[B]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp 
float [[A]], [[B]] +; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP2]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP7]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP13]] +; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_nosub_rhs +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float nofpclass(sub) [[B:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[NO_MD:%.*]] = fdiv float [[A]], [[B]] +; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 +; IEEE-BADFREXP-NEXT: store volatile float 
[[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-BADFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] +; IEEE-BADFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[AFN_MD_25ULP:%.*]] = fdiv afn float [[A]], [[B]], !fpmath !0 +; IEEE-BADFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] +; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP7]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = 
extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP13]] +; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_nosub_rhs ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float nofpclass(sub) [[B:%.*]]) #[[ATTR1]] { @@ -2103,14 +3542,15 @@ ; DAZ-NEXT: [[TMP1:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[B]]) ; DAZ-NEXT: [[FAST_MD_25ULP:%.*]] = fmul fast float [[A]], [[TMP1]] ; DAZ-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP2:%.*]] = call afn float @llvm.amdgcn.rcp.f32(float [[B]]) -; DAZ-NEXT: [[AFN_MD_25ULP:%.*]] = fmul afn float [[A]], [[TMP2]] +; DAZ-NEXT: [[AFN_MD_25ULP:%.*]] = call afn float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) ; DAZ-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; DAZ-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = call arcp float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) +; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP2]] ; DAZ-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fdiv arcp float [[A]], [[B]], !fpmath !2 +; DAZ-NEXT: [[TMP3:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; 
DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP3]] ; DAZ-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -2149,6 +3589,3 @@ !1 = !{float 5.000000e-01} !2 = !{float 1.000000e+00} !3 = !{float 3.000000e+00} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; IEEE-BADFREXP: {{.*}} -; IEEE-GOODFREXP: {{.*}} Index: llvm/test/CodeGen/AMDGPU/fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -1242,69 +1242,42 @@ ; GFX67: ; %bb.0: ; %entry ; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000 -; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_cmp_gt_f32_e64 vcc, |s6|, v0 -; GFX67-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX67-NEXT: v_cmp_gt_f32_e64 vcc, |s7|, v0 -; GFX67-NEXT: v_mul_f32_e32 v3, s6, v2 -; GFX67-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX67-NEXT: v_rcp_f32_e32 v3, v3 -; GFX67-NEXT: v_mul_f32_e32 v0, s7, v1 -; GFX67-NEXT: v_rcp_f32_e32 v4, v0 ; GFX67-NEXT: s_mov_b32 s2, -1 -; GFX67-NEXT: v_mul_f32_e32 v0, s4, v3 -; GFX67-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX67-NEXT: v_mul_f32_e32 v2, s5, v4 -; GFX67-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX67-NEXT: s_waitcnt lgkmcnt(0) +; GFX67-NEXT: v_rcp_f32_e32 v0, s6 +; GFX67-NEXT: v_rcp_f32_e32 v1, s7 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX67-NEXT: v_mul_f32_e32 v1, s5, v1 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: v_mov_b32_e32 v0, 0x6f800000 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e64 vcc, |s6|, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX8-NEXT: v_cmp_gt_f32_e64 vcc, |s7|, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, s6, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v0, s7, v1 -; GFX8-NEXT: v_rcp_f32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, s4, v3 -; GFX8-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX8-NEXT: v_mul_f32_e32 v2, s5, v4 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX8-NEXT: v_rcp_f32_e32 v0, s6 +; GFX8-NEXT: v_rcp_f32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, s5, v1 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x6f800000, |s6| -; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s2 -; GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x6f800000, |s7| -; GFX10-NEXT: v_mul_f32_e32 v2, s6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x2f800000, s2 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, s7, v1 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, s4, v2 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, s5, v3 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s6 +; GFX10-NEXT: v_rcp_f32_e32 v1, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, s5, v1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: @@ 
-1313,18 +1286,12 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x6f800000, |s6| -; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s2 -; GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x6f800000, |s7| -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x2f800000, s2 -; GFX11-NEXT: v_dual_mul_f32 v2, s6, v0 :: v_dual_mul_f32 v3, s7, v1 -; GFX11-NEXT: v_rcp_f32_e32 v2, v2 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f32_e32 v0, s6 +; GFX11-NEXT: v_rcp_f32_e32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v2, s4, v2 :: v_dual_mul_f32 v3, s5, v3 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 -; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -6,18 +6,16 @@ ; GCN-DENORM-LABEL: div_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: ; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_mul_f32_e32 v1, s2, v0 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 -; 
GCN-DENORM-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s2 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2 +; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_1_by_x_25ulp: @@ -40,18 +38,16 @@ ; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: ; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_mul_f32_e64 v1, s2, -v0 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-DENORM-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s2 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2 +; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp: @@ -74,18 +70,16 @@ ; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: ; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 -; 
GCN-DENORM-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_mul_f32_e64 v1, -s2, v0 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-DENORM-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s2 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2 +; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp: @@ -109,18 +103,16 @@ ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: ; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_mul_f32_e32 v1, s2, v0 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-DENORM-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s2 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2 +; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp: @@ -141,6 +133,49 @@ } define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_v4_1_by_x_25ulp: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) 
+; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v2, s1 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v1, s0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v3, s1 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v5, s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v1, 0, v1 +; GCN-DENORM-NEXT: v_sub_u32_e32 v3, 0, v3 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v1 +; GCN-DENORM-NEXT: v_ldexp_f32 v1, v2, v3 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v3, s3 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s2 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v6 +; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 0, v5 +; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_v4_1_by_x_25ulp: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0 +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, s1 +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, s2 +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, s3 +; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-FLUSH-NEXT: s_endpgm %load = load <4 x float>, ptr addrspace(1) %arg, align 16 %div = fdiv <4 x float> , %load, !fpmath !0 store <4 x float> %div, ptr addrspace(1) %arg, align 16 @@ -151,32 +186,30 @@ ; GCN-DENORM-LABEL: div_v4_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: ; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 -; 
GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s1|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v9, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_mul_f32_e64 v5, s0, -v2 -; GCN-DENORM-NEXT: v_mul_f32_e64 v6, s1, -v3 -; GCN-DENORM-NEXT: v_mul_f32_e64 v8, s2, -v7 -; GCN-DENORM-NEXT: v_mul_f32_e64 v0, s3, -v9 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v2, -s1 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v1, s0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v3, s1 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v5, -s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v1, 0, v1 +; GCN-DENORM-NEXT: v_sub_u32_e32 v3, 0, v3 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v6, v6 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v8, v8 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v10, v0 -; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v2, v5 -; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v6 -; GCN-DENORM-NEXT: v_mul_f32_e32 v2, v7, v8 -; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v9, v10 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v1 +; GCN-DENORM-NEXT: v_ldexp_f32 v1, v2, v3 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v3, -s3 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s2 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v6 +; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 0, v5 
+; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 ; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GCN-DENORM-NEXT: s_endpgm ; @@ -203,32 +236,30 @@ ; GCN-DENORM-LABEL: div_v4_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: ; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s1|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v9, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_mul_f32_e64 v5, -s0, v2 -; GCN-DENORM-NEXT: v_mul_f32_e64 v6, -s1, v3 -; GCN-DENORM-NEXT: v_mul_f32_e64 v8, -s2, v7 -; GCN-DENORM-NEXT: v_mul_f32_e64 v0, -s3, v9 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v2, -s1 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v1, s0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v3, s1 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v5, -s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v1, 0, v1 +; GCN-DENORM-NEXT: v_sub_u32_e32 v3, 0, v3 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v6, v6 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v8, v8 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v10, v0 -; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v2, v5 -; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v6 -; GCN-DENORM-NEXT: v_mul_f32_e32 v2, v7, v8 -; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v9, v10 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v1 +; GCN-DENORM-NEXT: v_ldexp_f32 
v1, v2, v3 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v3, -s3 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s2 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v6 +; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 0, v5 +; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 ; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GCN-DENORM-NEXT: s_endpgm ; @@ -256,32 +287,30 @@ ; GCN-DENORM-LABEL: div_v4_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: ; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s1|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v9, 1.0, v1, vcc -; GCN-DENORM-NEXT: v_mul_f32_e32 v5, s0, v2 -; GCN-DENORM-NEXT: v_mul_f32_e32 v6, s1, v3 -; GCN-DENORM-NEXT: v_mul_f32_e32 v8, s2, v7 -; GCN-DENORM-NEXT: v_mul_f32_e32 v0, s3, v9 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v2, s1 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v1, s0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v3, s1 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v5, s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v1, 0, v1 +; GCN-DENORM-NEXT: v_sub_u32_e32 v3, 0, v3 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v6, v6 -; 
GCN-DENORM-NEXT: v_rcp_f32_e32 v8, v8 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v10, v0 -; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v2, v5 -; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v6 -; GCN-DENORM-NEXT: v_mul_f32_e32 v2, v7, v8 -; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v9, v10 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v1 +; GCN-DENORM-NEXT: v_ldexp_f32 v1, v2, v3 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v3, s3 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s2 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v6 +; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 0, v5 +; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 ; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GCN-DENORM-NEXT: s_endpgm ; @@ -333,22 +362,20 @@ ; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v4, v6 ; GCN-DENORM-NEXT: s_mov_b64 vcc, s[0:1] ; GCN-DENORM-NEXT: v_div_fmas_f32 v3, v1, v5, v7 -; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x6f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s5|, v1 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s6|, v1 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-DENORM-NEXT: v_mul_f32_e32 v1, s5, v4 -; GCN-DENORM-NEXT: v_mul_f32_e64 v5, s6, -v2 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s5 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v5, -s6 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-DENORM-NEXT: v_mov_b32_e32 v6, 0 -; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v4, v1 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s5 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s6 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v6, 0, v6 +; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 +; GCN-DENORM-NEXT: v_ldexp_f32 v1, v1, v2 +; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v6 ; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 2.0 -; 
GCN-DENORM-NEXT: v_mul_f32_e32 v2, v2, v5 ; GCN-DENORM-NEXT: v_div_fixup_f32 v3, v3, s7, -2.0 -; GCN-DENORM-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_c_by_x_25ulp: @@ -410,22 +437,20 @@ ; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v4, v6 ; GCN-DENORM-NEXT: s_mov_b64 vcc, s[0:1] ; GCN-DENORM-NEXT: v_div_fmas_f32 v3, v1, v5, v7 -; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x6f800000 -; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s5|, v1 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s6|, v1 -; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-DENORM-NEXT: v_mul_f32_e64 v1, -s5, v4 -; GCN-DENORM-NEXT: v_mul_f32_e32 v5, s6, v2 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v1, -s5 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v5, s6 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-DENORM-NEXT: v_mov_b32_e32 v6, 0 -; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v4, v1 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s5 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s6 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v6, 0, v6 +; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 +; GCN-DENORM-NEXT: v_ldexp_f32 v1, v1, v2 +; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v6 ; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -2.0 -; GCN-DENORM-NEXT: v_mul_f32_e32 v2, v2, v5 ; GCN-DENORM-NEXT: v_div_fixup_f32 v3, v3, -s7, -2.0 -; GCN-DENORM-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_c_by_minus_x_25ulp: Index: llvm/test/CodeGen/AMDGPU/rcp-pattern.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -274,17 +274,28 @@ } 
define float @v_rcp_f32_ieee_ulp25(float %x) #3 { -; GCN-LABEL: v_rcp_f32_ieee_ulp25: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_rcp_f32_ieee_ulp25: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: v_rcp_f32_e32 v1, v1 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_f32_ieee_ulp25: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; VI-NEXT: v_rcp_f32_e32 v1, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_rcp_f32_ieee_ulp25: ; R600: ; %bb.0: @@ -295,17 +306,28 @@ } define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 { -; GCN-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: v_rcp_f32_e32 v1, v1 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; VI-NEXT: v_rcp_f32_e32 v1, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal: ; R600: ; %bb.0: @@ -316,17 +338,28 @@ } define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 { -; GCN-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e64 v0, v0, -v1 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5] +; SI-NEXT: v_rcp_f32_e32 v1, v1 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; VI-NEXT: 
v_rcp_f32_e32 v1, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal: ; R600: ; %bb.0: @@ -337,17 +370,28 @@ } define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 { -; GCN-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: v_rcp_f32_e32 v1, v1 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; VI-NEXT: v_rcp_f32_e32 v1, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan: ; R600: ; %bb.0: @@ -373,17 +417,28 @@ } define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 { -; GCN-LABEL: v_neg_rcp_f32_ieee_ulp25: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, 
vcc -; GCN-NEXT: v_mul_f32_e64 v0, v0, -v1 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_neg_rcp_f32_ieee_ulp25: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5] +; SI-NEXT: v_rcp_f32_e32 v1, v1 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_neg_rcp_f32_ieee_ulp25: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; VI-NEXT: v_rcp_f32_e32 v1, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_neg_rcp_f32_ieee_ulp25: ; R600: ; %bb.0: @@ -501,17 +556,28 @@ } define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 { -; GCN-LABEL: v_rcp_fabs_f32_ieee_ulp25: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e64 v0, |v0|, v1 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_frexp_mant_f32_e64 v1, |v0| +; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v1, s[4:5] +; SI-NEXT: v_rcp_f32_e32 v1, v1 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; 
+; VI-LABEL: v_rcp_fabs_f32_ieee_ulp25: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_frexp_mant_f32_e64 v1, |v0| +; VI-NEXT: v_rcp_f32_e32 v1, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_rcp_fabs_f32_ieee_ulp25: ; R600: ; %bb.0: @@ -631,17 +697,28 @@ } define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 { -; GCN-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e64 v0, |v0|, -v1 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_frexp_mant_f32_e64 v1, -|v0| +; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-NEXT: v_cndmask_b32_e64 v1, -|v0|, v1, s[4:5] +; SI-NEXT: v_rcp_f32_e32 v1, v1 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_frexp_mant_f32_e64 v1, -|v0| +; VI-NEXT: v_rcp_f32_e32 v1, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25: ; R600: ; %bb.0: Index: llvm/test/CodeGen/AMDGPU/rsq.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -54,30 
+54,54 @@ ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; -; GCN-IEEE-SAFE-LABEL: rsq_f32: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 -; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 -; GCN-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s2 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-IEEE-SAFE-NEXT: s_endpgm +; SI-IEEE-SAFE-LABEL: rsq_f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 +; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 
v1, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: s_endpgm +; +; CI-IEEE-SAFE-LABEL: rsq_f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 +; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: rsq_f32: ; GCN-UNSAFE: ; %bb.0: ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 @@ -125,23 +149,40 @@ ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; -; GCN-IEEE-SAFE-LABEL: rsq_f32_sgpr: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, s2 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s2 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 
1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, -1 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-IEEE-SAFE-NEXT: s_endpgm +; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, s2 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, -1 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-IEEE-SAFE-NEXT: s_endpgm +; +; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, s2 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, -1 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-UNSAFE: ; %bb.0: ; GCN-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb @@ -203,9 +244,9 @@ ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 
offset:8 glc ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) ; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-DAZ-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v2 +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v2, v2 +; GCN-DAZ-UNSAFE-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-DAZ-UNSAFE-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm @@ -227,9 +268,9 @@ ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) ; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v2 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v2, v2 +; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm @@ -319,24 +360,24 @@ } define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { -; GCN-DAZ-LABEL: neg_rsq_f32: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-NEXT: s_mov_b32 s10, s6 -; GCN-DAZ-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-NEXT: s_mov_b32 s4, s0 -; GCN-DAZ-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-NEXT: s_endpgm +; GCN-DAZ-UNSAFE-LABEL: neg_rsq_f32: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; 
GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: @@ -352,35 +393,78 @@ ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; -; GCN-IEEE-SAFE-LABEL: neg_rsq_f32: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 -; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 -; GCN-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s2 -; 
GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v1 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-IEEE-SAFE-NEXT: s_endpgm +; GCN-DAZ-SAFE-LABEL: neg_rsq_f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; +; SI-IEEE-SAFE-LABEL: neg_rsq_f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 +; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; 
SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: s_endpgm +; +; CI-IEEE-SAFE-LABEL: neg_rsq_f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 +; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_f32: ; GCN-UNSAFE: ; %bb.0: ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 @@ -407,24 +491,24 @@ } define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { -; GCN-DAZ-LABEL: neg_rsq_neg_f32: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-NEXT: s_mov_b32 s10, s6 -; GCN-DAZ-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-NEXT: s_mov_b32 s4, s0 -; GCN-DAZ-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; 
GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-NEXT: s_endpgm +; GCN-DAZ-UNSAFE-LABEL: neg_rsq_neg_f32: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: @@ -440,35 +524,78 @@ ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; -; GCN-IEEE-SAFE-LABEL: neg_rsq_neg_f32: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 -; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 -; GCN-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; 
GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s2 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v1 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-IEEE-SAFE-NEXT: s_endpgm +; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; +; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 +; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 
v1, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: s_endpgm +; +; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 +; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-UNSAFE: ; %bb.0: ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 @@ -496,32 +623,51 @@ } define float @v_neg_rsq_neg_f32(float %val) { -; GCN-DAZ-LABEL: v_neg_rsq_neg_f32: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 +; 
GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v1 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: 
+; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %val.fneg = fneg float %val %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg) %div = fdiv contract float -1.0, %sqrt, !fpmath !0 @@ -529,42 +675,71 @@ } define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) { -; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_sqrt_f32_e64 v1, -v1 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v1, -v1 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32: ; GCN-IEEE-UNSAFE: ; %bb.0: ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v1, -v1 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1 +; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: -; GCN-IEEE-SAFE: ; 
%bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v3 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v1, v1, -v2 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v3, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v2, v1 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: 
v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v3, -v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v3 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %val.fneg = fneg <2 x float> %val %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg) %div = fdiv contract <2 x float> , %sqrt, !fpmath !0 @@ -572,35 +747,54 @@ } define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) { -; GCN-DAZ-LABEL: v_neg_rsq_neg_f32_foldable_user: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: ; GCN-IEEE-UNSAFE: ; %bb.0: ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: 
v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v2 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v2, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: +; CI-IEEE-SAFE: ; 
%bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %val0.neg = fneg float %val0 %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg) %div = fdiv contract float -1.0, %sqrt, !fpmath !0 @@ -609,48 +803,77 @@ } define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) { -; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32_foldable_user: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_sqrt_f32_e64 v1, -v1 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v1, -v1 -; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v0, v2 -; GCN-DAZ-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1 +; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: ; GCN-IEEE-UNSAFE: ; %bb.0: ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v1, -v1 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v1, -v1 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 
+; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1 +; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v4, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v5 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v1, v1, -v4 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v5, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v4, v1 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 +; 
SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %val0.fneg = fneg <2 x float> %val0 %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg) %div = fdiv contract <2 x float> , %sqrt, !fpmath !0 @@ -659,109 +882,176 @@ } define float @v_neg_rsq_f32(float %val) { -; GCN-DAZ-LABEL: v_neg_rsq_f32: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-SAFE-LABEL: v_neg_rsq_f32: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v1 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 
v1, -v0, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val) %div = fdiv contract float -1.0, %sqrt, !fpmath !0 ret float %div } define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) { -; GCN-DAZ-LABEL: v_neg_rsq_v2f32: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-DAZ-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v1, -v1 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32: ; GCN-IEEE-UNSAFE: ; %bb.0: ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 
v1, v1 +; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-SAFE-LABEL: v_neg_rsq_v2f32: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v3 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v1, v1, -v2 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v3, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v2, v1 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: 
v_ldexp_f32_e32 v0, v2, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v3, -v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v3 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val) %div = fdiv contract <2 x float> , %sqrt, !fpmath !0 ret <2 x float> %div } define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) { -; GCN-DAZ-LABEL: v_neg_rsq_f32_foldable_user: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: 
v_neg_rsq_f32_foldable_user: ; GCN-IEEE-UNSAFE: ; %bb.0: ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v2 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v2, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, 
v0, v1 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val0) %div = fdiv contract float -1.0, %sqrt, !fpmath !0 %user = fmul contract float %div, %val1 @@ -769,48 +1059,77 @@ } define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) { -; GCN-DAZ-LABEL: v_neg_rsq_v2f32_foldable_user: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-DAZ-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-NEXT: v_rcp_f32_e64 v1, -v1 -; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v0, v2 -; GCN-DAZ-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1 +; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user: ; GCN-IEEE-UNSAFE: ; %bb.0: ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v1, -v1 -; GCN-IEEE-UNSAFE-NEXT: 
v_mul_f32_e32 v0, v0, v2 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1 +; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v4, 0x2f800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v5 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v1, v1, -v4 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v5, v0 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v4, v1 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 +; 
SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0) %div = fdiv contract <2 x float> , %sqrt, !fpmath !0 %user = fmul contract <2 x float> %div, %val1 @@ -833,18 +1152,15 @@ ; GCN-IEEE-SAFE-LABEL: v_rsq_f32: ; 
GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v1 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4b800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x45800000 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 @@ -852,12 +1168,84 @@ } define float @v_rsq_f32_missing_contract0(float %val) { +; GCN-DAZ-LABEL: v_rsq_f32_missing_contract0: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract0: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 ret float %div } define float @v_rsq_f32_missing_contract1(float %val) { +; GCN-DAZ-LABEL: v_rsq_f32_missing_contract1: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract1: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; 
SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 %div = fdiv float 1.0, %sqrt, !fpmath !1 ret float %div @@ -883,19 +1271,15 @@ ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user: ; GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v3, v2 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 
0x45800000 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 @@ -922,19 +1306,15 @@ ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0: ; GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v3, v2 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 @@ -961,18 +1341,15 @@ ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1: ; GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: 
v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v3, v2 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 @@ -988,10 +1365,8 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CI-DAZ-SAFE: {{.*}} ; CI-DAZ-UNSAFE: {{.*}} -; CI-IEEE-SAFE: {{.*}} ; CI-IEEE-UNSAFE: {{.*}} ; GCN-IEEE: {{.*}} ; SI-DAZ-SAFE: {{.*}} ; SI-DAZ-UNSAFE: {{.*}} -; SI-IEEE-SAFE: {{.*}} ; SI-IEEE-UNSAFE: {{.*}}