Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -147,6 +147,11 @@ DT); } + bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const { + return HasFP32DenormalFlush || + computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal(); + } + /// Promotes uniform binary operation \p I to equivalent 32 bit binary /// operation. /// @@ -247,13 +252,22 @@ Value *matchFractPat(IntrinsicInst &I); Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg); + bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF, + FastMathFlags SqrtFMF) const; + Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, FastMathFlags SqrtFMF, - const Instruction *CtxI, bool AllowApproxRsq) const; + const Instruction *CtxI) const; Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den, - FastMathFlags FMF, const Instruction *CtxI, - bool AllowInaccurateRcp, bool RcpIsAccurate) const; + FastMathFlags FMF, const Instruction *CtxI) const; + Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den, + float ReqdAccuracy) const; + + Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den, + FastMathFlags DivFMF, FastMathFlags SqrtFMF, + Value *RsqOp, const Instruction *FDiv, + float ReqdAccuracy) const; public: bool visitFDiv(BinaryOperator &I); @@ -815,13 +829,27 @@ return Builder.CreateFMul(Rsq, OutputScaleFactor); } +bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, + FastMathFlags DivFMF, + FastMathFlags SqrtFMF) const { + // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. 
+ if (!DivFMF.allowContract() || !SqrtFMF.allowContract()) + return false; + + // v_rsq_f32 gives 1ulp + return SqrtFMF.approxFunc() || HasUnsafeFPMath || + SqrtOp->getFPAccuracy() >= 1.0f; +} + Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, - FastMathFlags SqrtFMF, const Instruction *CtxI, bool AllowApproxRsq) const { + FastMathFlags SqrtFMF, const Instruction *CtxI) const { // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. - if (!DivFMF.allowContract() || !SqrtFMF.allowContract()) - return nullptr; + assert(DivFMF.allowContract() && SqrtFMF.allowContract()); + // rsq_f16 is accurate to 0.51 ulp. + // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. + // rsq_f64 is never accurate. const ConstantFP *CLHS = dyn_cast(Num); if (!CLHS) return nullptr; @@ -830,14 +858,16 @@ assert(Ty->isFloatTy()); bool IsNegative = false; + + // TODO: Handle other numerator values with arcp. if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) { // Add in the sqrt flags. IRBuilder<>::FastMathFlagGuard Guard(Builder); DivFMF |= SqrtFMF; Builder.setFastMathFlags(DivFMF); - if (HasFP32DenormalFlush || AllowApproxRsq || - computeKnownFPClass(Den, fcSubnormal, CtxI).isKnownNeverSubnormal()) { + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || + canIgnoreDenormalInput(Den, CtxI)) { Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); // -1.0 / sqrt(x) -> fneg(rsq(x)) return IsNegative ? Builder.CreateFNeg(Result) : Result; @@ -855,14 +885,13 @@ // allowed with unsafe-fp-math or afn. 
// // a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0 -Value *AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, - Value *Num, Value *Den, - FastMathFlags FMF, - const Instruction *CtxI, - bool AllowInaccurateRcp, - bool RcpIsAccurate) const { - assert(AllowInaccurateRcp || RcpIsAccurate); - +Value * +AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num, + Value *Den, FastMathFlags FMF, + const Instruction *CtxI) const { + // rcp_f16 is accurate to 0.51 ulp. + // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. + // rcp_f64 is never accurate. Type *Ty = Den->getType(); assert(Ty->isFloatTy()); @@ -872,7 +901,7 @@ (IsNegative = CLHS->isExactlyValue(-1.0))) { Value *Src = Den; - if (HasFP32DenormalFlush || AllowInaccurateRcp) { + if (HasFP32DenormalFlush || FMF.approxFunc()) { // -1.0 / x -> 1.0 / fneg(x) if (IsNegative) Src = Builder.CreateFNeg(Src); @@ -902,7 +931,7 @@ // TODO: Could avoid denormal scaling and use raw rcp if we knew the output // will never underflow. - if (AllowInaccurateRcp || HasFP32DenormalFlush) { + if (HasFP32DenormalFlush || FMF.approxFunc()) { Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den); return Builder.CreateFMul(Num, Recip); } @@ -921,9 +950,8 @@ // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. // // NOTE: optimizeWithRcp should be tried first because rcp is the preference. -static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, - bool HasFP32DenormalFlush, - IRBuilder<> &Builder, Module *Mod) { +Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast( + IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const { // fdiv.fast can achieve 2.5 ULP accuracy. 
if (ReqdAccuracy < 2.5f) return nullptr; @@ -943,8 +971,25 @@ if (!HasFP32DenormalFlush && !NumIsOne) return nullptr; - Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - return Builder.CreateCall(Decl, { Num, Den }); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den}); +} + +Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( + IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, + FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst, + float ReqdDivAccuracy) const { + if (RsqOp) { + Value *Rsq = + optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst); + if (Rsq) + return Rsq; + } + + Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst); + if (Rcp) + return Rcp; + + return optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy); } // Optimizations is performed based on fpmath, fast math flags as well as @@ -975,8 +1020,7 @@ const FPMathOperator *FPOp = cast(&FDiv); const FastMathFlags DivFMF = FPOp->getFastMathFlags(); - - const float ReqdAccuracy = FPOp->getFPAccuracy(); + const float ReqdAccuracy = FPOp->getFPAccuracy(); // Inaccurate rcp is allowed with unsafe-fp-math or afn. // @@ -991,110 +1035,69 @@ if (AllowInaccurateRcp) return false; - bool AllowApproxRsq = false; + // Defer the correct implementations to codegen. + if (ReqdAccuracy < 1.0f) + return false; FastMathFlags SqrtFMF; - // rcp_f16 is accurate to 0.51 ulp. - // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. - // rcp_f64 is never accurate. 
- const bool RcpIsAccurate = ReqdAccuracy >= 1.0f; Value *Num = FDiv.getOperand(0); Value *Den = FDiv.getOperand(1); Value *RsqOp = nullptr; auto *DenII = dyn_cast(Den); if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && - DenII->hasOneUse() && (RcpIsAccurate || AllowInaccurateRcp)) { + DenII->hasOneUse()) { const auto *SqrtOp = cast(DenII); - AllowApproxRsq = HasUnsafeFPMath || SqrtOp->hasApproxFunc(); - - if (AllowApproxRsq || SqrtOp->getFPAccuracy() >= 1.0f) { - SqrtFMF = SqrtOp->getFastMathFlags(); + SqrtFMF = SqrtOp->getFastMathFlags(); + if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) RsqOp = SqrtOp->getOperand(0); - } } IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); Builder.setFastMathFlags(DivFMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); - Value *NewFDiv = nullptr; - if (auto *VT = dyn_cast(FDiv.getType())) { - NewFDiv = PoisonValue::get(VT); - - // FIXME: Doesn't do the right thing for cases where the vector is partially - // constant. This works when the scalarizer pass is run first. - for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { - Value *NumEltI = Builder.CreateExtractElement(Num, I); - - Value *NewElt = nullptr; - if (RsqOp) { - Value *DenEltI = Builder.CreateExtractElement(RsqOp, I); - NewElt = optimizeWithRsq(Builder, NumEltI, DenEltI, DivFMF, SqrtFMF, - &FDiv, AllowApproxRsq); - if (!NewElt) { - // TODO: Avoid inserting dead extract in the first place - if (Instruction *Extract = dyn_cast(DenEltI)) - Extract->eraseFromParent(); - } - } - - Value *DenEltI = nullptr; - - if (!NewElt && (RcpIsAccurate || AllowInaccurateRcp)) { - DenEltI = Builder.CreateExtractElement(Den, I); - - // Try rcp first. - NewElt = optimizeWithRcp(Builder, NumEltI, DenEltI, DivFMF, - cast(FPOp), AllowInaccurateRcp, - RcpIsAccurate); - if (!NewElt) // Try fdiv.fast. 
- NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy, - HasFP32DenormalFlush, Builder, Mod); - } - - if (!NewElt) { - if (!DenEltI) - DenEltI = Builder.CreateExtractElement(Den, I); - - // Keep the original, but scalarized. - Value *ScalarDiv = Builder.CreateFDiv(NumEltI, DenEltI); - if (auto *ScalarDivInst = dyn_cast(ScalarDiv)) - ScalarDivInst->copyMetadata(FDiv); - NewElt = ScalarDiv; - } - - NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); - } - } else { // Scalar FDiv. - if (RsqOp) { - NewFDiv = optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, - cast(FPOp), AllowApproxRsq); + SmallVector NumVals; + SmallVector DenVals; + SmallVector RsqDenVals; + extractValues(Builder, NumVals, Num); + extractValues(Builder, DenVals, Den); + + if (RsqOp) + extractValues(Builder, RsqDenVals, RsqOp); + + SmallVector ResultVals(NumVals.size()); + for (int I = 0, E = NumVals.size(); I != E; ++I) { + Value *NumElt = NumVals[I]; + Value *DenElt = DenVals[I]; + Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr; + + Value *NewElt = + visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt, + cast(FPOp), ReqdAccuracy); + if (!NewElt) { + // Keep the original, but scalarized. + + // This has the unfortunate side effect of sometimes scalarizing when + // we're not going to do anything. + NewElt = Builder.CreateFDiv(NumElt, DenElt); + if (auto *NewEltInst = dyn_cast(NewElt)) + NewEltInst->copyMetadata(FDiv); } - if (!NewFDiv) { - // Try rcp first. - if (RcpIsAccurate || AllowInaccurateRcp) { - NewFDiv = - optimizeWithRcp(Builder, Num, Den, DivFMF, cast(FPOp), - AllowInaccurateRcp, RcpIsAccurate); - } - - if (!NewFDiv) { // Try fdiv.fast. 
- NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, - HasFP32DenormalFlush, Builder, Mod); - } - } + ResultVals[I] = NewElt; } - if (NewFDiv) { - FDiv.replaceAllUsesWith(NewFDiv); - NewFDiv->takeName(&FDiv); + Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals); + + if (NewVal) { + FDiv.replaceAllUsesWith(NewVal); + NewVal->takeName(&FDiv); RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo); } - return !!NewFDiv; + return true; } static bool hasUnsafeFPMath(const Function &F) { Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -772,176 +772,176 @@ ; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_flags ; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { ; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP2]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP3]]) -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP9]]) -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP10]], 1 -; 
IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP12]] -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP11]]) -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP13]]) -; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP15]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractvalue { float, i32 } [[TMP3]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP9]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP9]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP11]] +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP13]], i32 [[TMP12]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i64 0 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP16]]) 
-; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP17]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP17]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = sub i32 0, [[TMP19]] -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP18]]) -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP21]], i32 [[TMP20]]) -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP24]]) -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP26]]) -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]]) -; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP30]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP16]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP17]]) +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } 
[[TMP24]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP24]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = sub i32 0, [[TMP26]] +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP27]]) +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i64 0 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) -; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] -; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP33]]) -; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) -; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP39]]) -; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = sub i32 0, [[TMP42]] -; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP41]]) -; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP44]], i32 [[TMP43]]) -; IEEE-GOODFREXP-NEXT: 
[[MD_1ULP_NNAN:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP45]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP33]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP33]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = sub i32 0, [[TMP35]] +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP34]]) +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP37]], i32 [[TMP36]]) +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP32]]) +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP39]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = sub i32 0, [[TMP41]] +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP40]]) +; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP43]], i32 [[TMP42]]) +; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = insertelement <2 x float> poison, float [[TMP38]], i64 0 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NNAN:%.*]] = insertelement <2 x float> [[TMP45]], float [[TMP44]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP46]]) -; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = extractvalue { float, i32 } [[TMP47]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP47]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = sub i32 0, [[TMP49]] -; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float 
[[TMP48]]) -; IEEE-GOODFREXP-NEXT: [[TMP52:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP50]]) -; IEEE-GOODFREXP-NEXT: [[TMP53:%.*]] = insertelement <2 x float> poison, float [[TMP52]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP54:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP55:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP54]]) -; IEEE-GOODFREXP-NEXT: [[TMP56:%.*]] = extractvalue { float, i32 } [[TMP55]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP57:%.*]] = extractvalue { float, i32 } [[TMP55]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP58:%.*]] = sub i32 0, [[TMP57]] -; IEEE-GOODFREXP-NEXT: [[TMP59:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP56]]) -; IEEE-GOODFREXP-NEXT: [[TMP60:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP59]], i32 [[TMP58]]) -; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NSZ:%.*]] = insertelement <2 x float> [[TMP53]], float [[TMP60]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP46]]) +; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = extractvalue { float, i32 } [[TMP48]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP50]] +; IEEE-GOODFREXP-NEXT: [[TMP52:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP49]]) +; IEEE-GOODFREXP-NEXT: [[TMP53:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP52]], i32 [[TMP51]]) +; IEEE-GOODFREXP-NEXT: [[TMP54:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP47]]) +; IEEE-GOODFREXP-NEXT: [[TMP55:%.*]] = extractvalue { float, i32 } [[TMP54]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP56:%.*]] = extractvalue { float, i32 } [[TMP54]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP57:%.*]] = sub i32 0, [[TMP56]] +; IEEE-GOODFREXP-NEXT: [[TMP58:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP55]]) +; IEEE-GOODFREXP-NEXT: [[TMP59:%.*]] = call nsz float 
@llvm.ldexp.f32.i32(float [[TMP58]], i32 [[TMP57]]) +; IEEE-GOODFREXP-NEXT: [[TMP60:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NSZ:%.*]] = insertelement <2 x float> [[TMP60]], float [[TMP59]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NSZ]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: ret void ; ; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_flags ; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { ; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP9]]) -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP9]]) -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP12]] -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP11]]) -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP14]], i32 [[TMP13]]) -; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP15]], i64 1 +; 
IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP9]], 0 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP11]] +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP13]], i32 [[TMP12]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i64 0 +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP16]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP17]], 0 -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP16]]) -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = sub i32 0, [[TMP19]] -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP18]]) -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = 
call ninf float @llvm.ldexp.f32.i32(float [[TMP21]], i32 [[TMP20]]) -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP24]]) -; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP24]]) -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP26]]) -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]]) -; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP30]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP17]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP24]], 0 +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP17]]) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = sub i32 0, [[TMP26]] +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call ninf float @llvm.ldexp.f32.i32(float 
[[TMP28]], i32 [[TMP27]]) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i64 0 +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) -; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 -; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP31]]) -; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] -; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP33]]) -; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) -; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP39]]) -; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 -; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP39]]) -; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = sub i32 0, [[TMP42]] -; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP41]]) -; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP44]], i32 [[TMP43]]) -; IEEE-BADFREXP-NEXT: [[MD_1ULP_NNAN:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP45]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } 
[[TMP33]], 0 +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = sub i32 0, [[TMP35]] +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP34]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP37]], i32 [[TMP36]]) +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP32]]) +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP32]]) +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = sub i32 0, [[TMP41]] +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP40]]) +; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP43]], i32 [[TMP42]]) +; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = insertelement <2 x float> poison, float [[TMP38]], i64 0 +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NNAN:%.*]] = insertelement <2 x float> [[TMP45]], float [[TMP44]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP46]]) -; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = extractvalue { float, i32 } [[TMP47]], 0 -; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP46]]) -; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = sub i32 0, [[TMP49]] -; IEEE-BADFREXP-NEXT: [[TMP51:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP48]]) -; IEEE-BADFREXP-NEXT: [[TMP52:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP50]]) -; IEEE-BADFREXP-NEXT: [[TMP53:%.*]] = insertelement <2 x float> poison, float [[TMP52]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP54:%.*]] = extractelement <2 x float> [[X]], i64 1 -; 
IEEE-BADFREXP-NEXT: [[TMP55:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP54]]) -; IEEE-BADFREXP-NEXT: [[TMP56:%.*]] = extractvalue { float, i32 } [[TMP55]], 0 -; IEEE-BADFREXP-NEXT: [[TMP57:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP54]]) -; IEEE-BADFREXP-NEXT: [[TMP58:%.*]] = sub i32 0, [[TMP57]] -; IEEE-BADFREXP-NEXT: [[TMP59:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP56]]) -; IEEE-BADFREXP-NEXT: [[TMP60:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP59]], i32 [[TMP58]]) -; IEEE-BADFREXP-NEXT: [[MD_1ULP_NSZ:%.*]] = insertelement <2 x float> [[TMP53]], float [[TMP60]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP46]]) +; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0 +; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP46]]) +; IEEE-BADFREXP-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP50]] +; IEEE-BADFREXP-NEXT: [[TMP52:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP49]]) +; IEEE-BADFREXP-NEXT: [[TMP53:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP52]], i32 [[TMP51]]) +; IEEE-BADFREXP-NEXT: [[TMP54:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP47]]) +; IEEE-BADFREXP-NEXT: [[TMP55:%.*]] = extractvalue { float, i32 } [[TMP54]], 0 +; IEEE-BADFREXP-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP47]]) +; IEEE-BADFREXP-NEXT: [[TMP57:%.*]] = sub i32 0, [[TMP56]] +; IEEE-BADFREXP-NEXT: [[TMP58:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP55]]) +; IEEE-BADFREXP-NEXT: [[TMP59:%.*]] = call nsz float @llvm.ldexp.f32.i32(float [[TMP58]], i32 [[TMP57]]) +; IEEE-BADFREXP-NEXT: [[TMP60:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0 +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NSZ:%.*]] = insertelement <2 x float> [[TMP60]], float [[TMP59]], i64 1 ; IEEE-BADFREXP-NEXT: store 
volatile <2 x float> [[MD_1ULP_NSZ]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath_flags ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP1]]) -; DAZ-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP4]]) -; DAZ-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP4:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 +; DAZ-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP7:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP6]]) -; DAZ-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 -; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP10:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP9]]) -; DAZ-NEXT: [[MD_1ULP_NINF:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP10]], i64 1 +; DAZ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP8:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP9:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP7]]) +; DAZ-NEXT: [[TMP10:%.*]] 
= insertelement <2 x float> poison, float [[TMP8]], i64 0 +; DAZ-NEXT: [[MD_1ULP_NINF:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP9]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP12:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP11]]) -; DAZ-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 -; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP15:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; DAZ-NEXT: [[MD_1ULP_NNAN:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP15]], i64 1 +; DAZ-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP13:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; DAZ-NEXT: [[TMP14:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; DAZ-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i64 0 +; DAZ-NEXT: [[MD_1ULP_NNAN:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP17:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP16]]) -; DAZ-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i64 0 -; DAZ-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP20:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP19]]) -; DAZ-NEXT: [[MD_1ULP_NSZ:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP20]], i64 1 +; DAZ-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP18:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP16]]) +; DAZ-NEXT: [[TMP19:%.*]] = call nsz float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP20:%.*]] = insertelement <2 x 
float> poison, float [[TMP18]], i64 0 +; DAZ-NEXT: [[MD_1ULP_NSZ:%.*]] = insertelement <2 x float> [[TMP20]], float [[TMP19]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_NSZ]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -963,81 +963,53 @@ define amdgpu_kernel void @fdiv_fpmath_f32_vector(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) { ; IEEE-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_vector ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[A]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[B]], i64 0 -; IEEE-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] -; IEEE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 -; IEEE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[A]], i64 1 -; IEEE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[B]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = fdiv float [[TMP5]], [[TMP6]] -; IEEE-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP7]], i64 1 +; IEEE-NEXT: [[NO_MD:%.*]] = fdiv <2 x float> [[A]], [[B]] ; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 -; IEEE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[A]], i64 0 -; IEEE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[B]], i64 0 -; IEEE-NEXT: [[TMP10:%.*]] = fdiv float [[TMP8]], [[TMP9]], !fpmath !1 -; IEEE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 -; IEEE-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[A]], i64 1 -; IEEE-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[B]], i64 1 -; IEEE-NEXT: [[TMP14:%.*]] = fdiv float [[TMP12]], [[TMP13]], !fpmath !1 -; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP14]], i64 1 +; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = fdiv <2 x float> [[A]], [[B]], !fpmath !1 ; IEEE-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr 
addrspace(1) [[OUT]], align 8 -; IEEE-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[A]], i64 0 -; IEEE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[B]], i64 0 -; IEEE-NEXT: [[TMP17:%.*]] = fdiv float [[TMP15]], [[TMP16]], !fpmath !2 -; IEEE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i64 0 -; IEEE-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[A]], i64 1 -; IEEE-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[B]], i64 1 -; IEEE-NEXT: [[TMP21:%.*]] = fdiv float [[TMP19]], [[TMP20]], !fpmath !2 -; IEEE-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP21]], i64 1 +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[A]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[A]], i64 1 +; IEEE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[B]], i64 0 +; IEEE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[B]], i64 1 +; IEEE-NEXT: [[TMP5:%.*]] = fdiv float [[TMP1]], [[TMP3]], !fpmath !2 +; IEEE-NEXT: [[TMP6:%.*]] = fdiv float [[TMP2]], [[TMP4]], !fpmath !2 +; IEEE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i64 0 +; IEEE-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP6]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 8 -; IEEE-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 0 -; IEEE-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[B]], i64 0 -; IEEE-NEXT: [[TMP24:%.*]] = fdiv float [[TMP22]], [[TMP23]], !fpmath !0 -; IEEE-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP24]], i64 0 -; IEEE-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[A]], i64 1 -; IEEE-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[B]], i64 1 -; IEEE-NEXT: [[TMP28:%.*]] = fdiv float [[TMP26]], [[TMP27]], !fpmath !0 -; IEEE-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP28]], i64 1 +; IEEE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[A]], i64 0 +; 
IEEE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[A]], i64 1 +; IEEE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[B]], i64 0 +; IEEE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[B]], i64 1 +; IEEE-NEXT: [[TMP12:%.*]] = fdiv float [[TMP8]], [[TMP10]], !fpmath !0 +; IEEE-NEXT: [[TMP13:%.*]] = fdiv float [[TMP9]], [[TMP11]], !fpmath !0 +; IEEE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 +; IEEE-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_vector ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[A]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[B]], i64 0 -; DAZ-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] -; DAZ-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 -; DAZ-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[A]], i64 1 -; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[B]], i64 1 -; DAZ-NEXT: [[TMP7:%.*]] = fdiv float [[TMP5]], [[TMP6]] -; DAZ-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP7]], i64 1 +; DAZ-NEXT: [[NO_MD:%.*]] = fdiv <2 x float> [[A]], [[B]] ; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[A]], i64 0 -; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[B]], i64 0 -; DAZ-NEXT: [[TMP10:%.*]] = fdiv float [[TMP8]], [[TMP9]], !fpmath !1 -; DAZ-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 -; DAZ-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[A]], i64 1 -; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[B]], i64 1 -; DAZ-NEXT: [[TMP14:%.*]] = fdiv float [[TMP12]], 
[[TMP13]], !fpmath !1 -; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP14]], i64 1 +; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = fdiv <2 x float> [[A]], [[B]], !fpmath !1 ; DAZ-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 8 -; DAZ-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[A]], i64 0 -; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[B]], i64 0 -; DAZ-NEXT: [[TMP17:%.*]] = fdiv float [[TMP15]], [[TMP16]], !fpmath !2 -; DAZ-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i64 0 -; DAZ-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[A]], i64 1 -; DAZ-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[B]], i64 1 -; DAZ-NEXT: [[TMP21:%.*]] = fdiv float [[TMP19]], [[TMP20]], !fpmath !2 -; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP21]], i64 1 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[A]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[A]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[B]], i64 0 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[B]], i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = fdiv float [[TMP1]], [[TMP3]], !fpmath !2 +; DAZ-NEXT: [[TMP6:%.*]] = fdiv float [[TMP2]], [[TMP4]], !fpmath !2 +; DAZ-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP6]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 8 -; DAZ-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 0 -; DAZ-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[B]], i64 0 -; DAZ-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[TMP22]], float [[TMP23]]) -; DAZ-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP24]], i64 0 -; DAZ-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[A]], i64 1 -; DAZ-NEXT: [[TMP27:%.*]] = extractelement <2 x float> 
[[B]], i64 1 -; DAZ-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[TMP26]], float [[TMP27]]) -; DAZ-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP28]], i64 1 +; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[A]], i64 0 +; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[A]], i64 1 +; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[B]], i64 0 +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[B]], i64 1 +; DAZ-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[TMP8]], float [[TMP10]]) +; DAZ-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[TMP9]], float [[TMP11]]) +; DAZ-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 +; DAZ-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: ret void ; @@ -1055,19 +1027,9 @@ define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath(ptr addrspace(1) %out, <2 x float> %x) { ; CHECK-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_vector_fpmath ; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = fdiv float 1.000000e+00, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = fdiv float 1.000000e+00, [[TMP4]] -; CHECK-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 +; CHECK-NEXT: [[NO_MD:%.*]] = fdiv <2 x float> , [[X]] ; CHECK-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = fdiv float 1.000000e+00, [[TMP6]], !fpmath !1 -; CHECK-NEXT: [[TMP8:%.*]] = 
insertelement <2 x float> poison, float [[TMP7]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = fdiv float 1.000000e+00, [[TMP9]], !fpmath !1 -; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP10]], i64 1 +; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = fdiv <2 x float> , [[X]], !fpmath !1 ; CHECK-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: [[AFN_NO_MD:%.*]] = fdiv afn <2 x float> , [[X]] ; CHECK-NEXT: store volatile <2 x float> [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 8 @@ -1097,12 +1059,7 @@ define amdgpu_kernel void @rcp_fdiv_f32_fpmath_vector_nonsplat(ptr addrspace(1) %out, <2 x float> %x) { ; CHECK-LABEL: define amdgpu_kernel void @rcp_fdiv_f32_fpmath_vector_nonsplat ; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = fdiv float 1.000000e+00, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = fdiv float 2.000000e+00, [[TMP4]] -; CHECK-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 +; CHECK-NEXT: [[NO_MD:%.*]] = fdiv <2 x float> , [[X]] ; CHECK-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: [[AFN_NO_MD:%.*]] = fdiv afn <2 x float> , [[X]] ; CHECK-NEXT: store volatile <2 x float> [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 8 @@ -1150,25 +1107,25 @@ ; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR1]] { ; IEEE-GOODFREXP-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> [[X]], float 1.000000e+00, i32 0 ; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 -; IEEE-GOODFREXP-NEXT: 
[[TMP2:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractvalue { float, i32 } [[TMP3]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP1]], [[TMP8]] -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fmul arcp float [[TMP11]], [[TMP18]] -; IEEE-GOODFREXP-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { 
float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul arcp float [[TMP1]], [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fmul arcp float [[TMP2]], [[TMP17]] +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; IEEE-GOODFREXP-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP_25ULP]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-GOODFREXP-NEXT: ret void ; @@ -1176,25 +1133,25 @@ ; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR1]] { ; IEEE-BADFREXP-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> [[X]], float 1.000000e+00, i32 0 ; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call i32 
@llvm.amdgcn.frexp.exp.i32.f32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP1]], [[TMP8]] -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP12]]) -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fmul arcp float [[TMP11]], [[TMP18]] -; IEEE-BADFREXP-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float 
[[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul arcp float [[TMP1]], [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fmul arcp float [[TMP2]], [[TMP17]] +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; IEEE-BADFREXP-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP_25ULP]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-BADFREXP-NEXT: ret void ; @@ -1202,15 +1159,15 @@ ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> [[X]], float 1.000000e+00, i32 0 ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; DAZ-NEXT: [[TMP3:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; DAZ-NEXT: [[TMP4:%.*]] = fmul arcp float [[TMP1]], [[TMP3]] -; DAZ-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i64 0 -; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 -; DAZ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; DAZ-NEXT: [[TMP8:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP7]]) -; DAZ-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP6]], [[TMP8]] 
-; DAZ-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP9]], i64 1 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X_INSERT]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP6:%.*]] = fmul arcp float [[TMP1]], [[TMP5]] +; DAZ-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP8:%.*]] = fmul arcp float [[TMP2]], [[TMP7]] +; DAZ-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0 +; DAZ-NEXT: [[ARCP_25ULP:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP8]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[ARCP_25ULP]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: ret void ; @@ -1823,93 +1780,99 @@ ; IEEE-LABEL: define amdgpu_kernel void @rsq_f32_vector_fpmath ; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { ; IEEE-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_X_NO_MD]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = fdiv contract float 1.000000e+00, [[TMP1]] -; IEEE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[SQRT_X_NO_MD]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fdiv contract float 1.000000e+00, [[TMP4]] -; IEEE-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 +; IEEE-NEXT: [[NO_MD:%.*]] = fdiv contract <2 x float> , [[SQRT_X_NO_MD]] ; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP7:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 -; IEEE-NEXT: [[TMP8:%.*]] = select contract i1 [[TMP7]], float 
0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP9:%.*]] = fmul contract float [[TMP6]], [[TMP8]] -; IEEE-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP9]]) -; IEEE-NEXT: [[TMP11:%.*]] = select contract i1 [[TMP7]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP12:%.*]] = fmul contract float [[TMP10]], [[TMP11]] -; IEEE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 -; IEEE-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP14]], 0x3810000000000000 -; IEEE-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP14]], [[TMP16]] -; IEEE-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) -; IEEE-NEXT: [[TMP19:%.*]] = select contract i1 [[TMP15]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP19]] -; IEEE-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP20]], i64 1 +; IEEE-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1 +; IEEE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP5:%.*]] = fcmp contract olt float [[TMP3]], 0x3810000000000000 +; IEEE-NEXT: [[TMP6:%.*]] = select contract i1 [[TMP5]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP7:%.*]] = fmul contract float [[TMP3]], [[TMP6]] +; IEEE-NEXT: [[TMP8:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP7]]) +; IEEE-NEXT: [[TMP9:%.*]] = select contract i1 [[TMP5]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: 
[[TMP10:%.*]] = fmul contract float [[TMP8]], [[TMP9]] +; IEEE-NEXT: [[TMP11:%.*]] = fcmp contract olt float [[TMP4]], 0x3810000000000000 +; IEEE-NEXT: [[TMP12:%.*]] = select contract i1 [[TMP11]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP13:%.*]] = fmul contract float [[TMP4]], [[TMP12]] +; IEEE-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP13]]) +; IEEE-NEXT: [[TMP15:%.*]] = select contract i1 [[TMP11]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP16:%.*]] = fmul contract float [[TMP14]], [[TMP15]] +; IEEE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 +; IEEE-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; IEEE-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP22:%.*]] = fcmp contract olt float [[TMP21]], 0x3810000000000000 +; IEEE-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 +; IEEE-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 +; IEEE-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP22:%.*]] = fcmp contract olt float [[TMP20]], 0x3810000000000000 ; IEEE-NEXT: [[TMP23:%.*]] = select contract i1 [[TMP22]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP21]], [[TMP23]] +; IEEE-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP20]], [[TMP23]] ; IEEE-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP24]]) ; IEEE-NEXT: [[TMP26:%.*]] = select contract i1 [[TMP22]], float 4.096000e+03, float 1.000000e+00 ; IEEE-NEXT: [[TMP27:%.*]] = fmul contract float [[TMP25]], 
[[TMP26]] -; IEEE-NEXT: [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 -; IEEE-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; IEEE-NEXT: [[TMP30:%.*]] = fdiv contract float undef, [[TMP29]], !fpmath !2 -; IEEE-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP28]], float [[TMP30]], i64 1 +; IEEE-NEXT: [[TMP28:%.*]] = fdiv contract float undef, [[TMP19]], !fpmath !2 +; IEEE-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 +; IEEE-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP28]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP32:%.*]] = fcmp contract olt float [[TMP31]], 0x3810000000000000 -; IEEE-NEXT: [[TMP33:%.*]] = select contract i1 [[TMP32]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP34:%.*]] = fmul contract float [[TMP31]], [[TMP33]] -; IEEE-NEXT: [[TMP35:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP34]]) -; IEEE-NEXT: [[TMP36:%.*]] = select contract i1 [[TMP32]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP37:%.*]] = fmul contract float [[TMP35]], [[TMP36]] -; IEEE-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 -; IEEE-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP40:%.*]] = fcmp contract olt float [[TMP39]], 0x3810000000000000 +; IEEE-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 +; IEEE-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 +; IEEE-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 +; IEEE-NEXT: [[TMP32:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-NEXT: [[TMP33:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-NEXT: [[TMP34:%.*]] = fcmp 
contract olt float [[TMP32]], 0x3810000000000000 +; IEEE-NEXT: [[TMP35:%.*]] = select contract i1 [[TMP34]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP36:%.*]] = fmul contract float [[TMP32]], [[TMP35]] +; IEEE-NEXT: [[TMP37:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP36]]) +; IEEE-NEXT: [[TMP38:%.*]] = select contract i1 [[TMP34]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP39:%.*]] = fmul contract float [[TMP37]], [[TMP38]] +; IEEE-NEXT: [[TMP40:%.*]] = fcmp contract olt float [[TMP33]], 0x3810000000000000 ; IEEE-NEXT: [[TMP41:%.*]] = select contract i1 [[TMP40]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP42:%.*]] = fmul contract float [[TMP39]], [[TMP41]] +; IEEE-NEXT: [[TMP42:%.*]] = fmul contract float [[TMP33]], [[TMP41]] ; IEEE-NEXT: [[TMP43:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP42]]) ; IEEE-NEXT: [[TMP44:%.*]] = select contract i1 [[TMP40]], float 4.096000e+03, float 1.000000e+00 ; IEEE-NEXT: [[TMP45:%.*]] = fmul contract float [[TMP43]], [[TMP44]] -; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP45]], i64 1 +; IEEE-NEXT: [[TMP46:%.*]] = insertelement <2 x float> poison, float [[TMP39]], i64 0 +; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP46]], float [[TMP45]], i64 1 ; IEEE-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rsq_f32_vector_fpmath ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_X_NO_MD]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = fdiv contract float 1.000000e+00, [[TMP1]] -; DAZ-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> 
[[SQRT_X_NO_MD]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = fdiv contract float 1.000000e+00, [[TMP4]] -; DAZ-NEXT: [[NO_MD:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 +; DAZ-NEXT: [[NO_MD:%.*]] = fdiv contract <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[SQRT_X_NO_MD]] ; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP7:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP6]]) -; DAZ-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 -; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP9]]) -; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP10]], i64 1 +; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP6]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) -; DAZ-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float 
[[TMP12]], i64 0 -; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; DAZ-NEXT: [[TMP15:%.*]] = fdiv contract float undef, [[TMP14]], !fpmath !2 -; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP15]], i64 1 +; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 +; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 +; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP10]]) +; DAZ-NEXT: [[TMP13:%.*]] = fdiv contract float undef, [[TMP9]], !fpmath !2 +; DAZ-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 +; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP16]]) -; DAZ-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i64 0 -; DAZ-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP20:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP19]]) -; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP20]], i64 1 +; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 +; DAZ-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 +; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 +; DAZ-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP19:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 
[[TMP17]]) +; DAZ-NEXT: [[TMP20:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP18]]) +; DAZ-NEXT: [[TMP21:%.*]] = insertelement <2 x float> poison, float [[TMP19]], i64 0 +; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP21]], float [[TMP20]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -2242,45 +2205,45 @@ ; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp_vector ; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[DENOM:%.*]]) #[[ATTR1]] { ; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractvalue { float, i32 } [[TMP3]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP1]], [[TMP8]] -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] -; IEEE-GOODFREXP-NEXT: 
[[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fmul arcp float [[TMP11]], [[TMP18]] -; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul arcp float [[TMP1]], [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fmul arcp float [[TMP2]], [[TMP17]] +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 ; 
IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = sub i32 0, [[TMP24]] -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP23]]) -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP25]]) -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = fmul arcp float [[TMP20]], [[TMP27]] -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) -; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] -; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP33]]) -; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) -; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = fmul arcp float [[TMP30]], [[TMP37]] -; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP38]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: 
[[TMP24:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP22]]) +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP24]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP24]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = sub i32 0, [[TMP26]] +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP27]]) +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = fmul arcp float [[TMP20]], [[TMP29]] +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP23]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = sub i32 0, [[TMP33]] +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP32]]) +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = fmul arcp float [[TMP21]], [[TMP36]] +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP30]], i64 0 +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-GOODFREXP-NEXT: ret void @@ -2288,45 +2251,45 @@ ; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp_vector ; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[DENOM:%.*]]) #[[ATTR1]] { ; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 
0 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP1]], [[TMP8]] -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP12]]) -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fmul arcp float [[TMP11]], [[TMP18]] -; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: 
[[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul arcp float [[TMP1]], [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fmul arcp float [[TMP2]], [[TMP17]] +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP21]]) -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = sub i32 0, [[TMP24]] -; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP23]]) -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP25]]) -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = fmul arcp float [[TMP20]], [[TMP27]] -; IEEE-BADFREXP-NEXT: 
[[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) -; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 -; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP31]]) -; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] -; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP33]]) -; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) -; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = fmul arcp float [[TMP30]], [[TMP37]] -; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP38]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP24]], 0 +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = sub i32 0, [[TMP26]] +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP27]]) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = fmul arcp float [[TMP20]], [[TMP29]] +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 +; IEEE-BADFREXP-NEXT: 
[[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = sub i32 0, [[TMP33]] +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP32]]) +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = fmul arcp float [[TMP21]], [[TMP36]] +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP30]], i64 0 +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-BADFREXP-NEXT: ret void @@ -2334,25 +2297,25 @@ ; DAZ-LABEL: define amdgpu_kernel void @multiple_arcp_fdiv_denom_1ulp_vector ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[DENOM:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP3:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; DAZ-NEXT: [[TMP4:%.*]] = fmul arcp float [[TMP1]], [[TMP3]] -; DAZ-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i64 0 -; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP8:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP7]]) -; DAZ-NEXT: [[TMP9:%.*]] = fmul arcp float [[TMP6]], [[TMP8]] -; DAZ-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP9]], i64 1 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[DENOM]], 
i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP6:%.*]] = fmul arcp float [[TMP1]], [[TMP5]] +; DAZ-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP8:%.*]] = fmul arcp float [[TMP2]], [[TMP7]] +; DAZ-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0 +; DAZ-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP8]], i64 1 ; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP12:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP11]]) -; DAZ-NEXT: [[TMP13:%.*]] = fmul arcp float [[TMP10]], [[TMP12]] -; DAZ-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i64 0 -; DAZ-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP17:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP16]]) -; DAZ-NEXT: [[TMP18:%.*]] = fmul arcp float [[TMP15]], [[TMP17]] -; DAZ-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP18]], i64 1 +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; DAZ-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP14:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; DAZ-NEXT: [[TMP15:%.*]] = fmul arcp float [[TMP10]], [[TMP14]] +; DAZ-NEXT: [[TMP16:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP17:%.*]] = fmul arcp float [[TMP11]], [[TMP16]] +; DAZ-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i64 0 +; DAZ-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP17]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; 
DAZ-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: ret void @@ -2431,45 +2394,45 @@ ; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { ; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 ; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractvalue { float, i32 } [[TMP3]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fmul arcp contract float [[TMP1]], [[TMP8]] -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) -; IEEE-GOODFREXP-NEXT: 
[[TMP19:%.*]] = fmul arcp contract float [[TMP11]], [[TMP18]] -; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul arcp contract float [[TMP1]], [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fmul arcp contract float [[TMP2]], [[TMP17]] +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; IEEE-GOODFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 ; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[DENOM]], 
i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = sub i32 0, [[TMP24]] -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP23]]) -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP25]]) -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = fmul arcp contract float [[TMP20]], [[TMP27]] -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) -; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] -; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP33]]) -; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) -; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = fmul arcp contract float [[TMP30]], [[TMP37]] -; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP38]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP22]]) +; IEEE-GOODFREXP-NEXT: 
[[TMP25:%.*]] = extractvalue { float, i32 } [[TMP24]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP24]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = sub i32 0, [[TMP26]] +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP27]]) +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = fmul arcp contract float [[TMP20]], [[TMP29]] +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP23]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = sub i32 0, [[TMP33]] +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = fmul arcp contract float [[TMP21]], [[TMP36]] +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP30]], i64 0 +; IEEE-GOODFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-GOODFREXP-NEXT: ret void @@ -2478,45 +2441,45 @@ ; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { ; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 ; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; 
IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = sub i32 0, [[TMP5]] -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP7]], i32 [[TMP6]]) -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fmul arcp contract float [[TMP1]], [[TMP8]] -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP12]]) -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP12]]) -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 0, [[TMP15]] -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP16]]) -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fmul arcp contract float [[TMP11]], [[TMP18]] -; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { 
float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul arcp contract float [[TMP1]], [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fmul arcp contract float [[TMP2]], [[TMP17]] +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; IEEE-BADFREXP-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP21]]) -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP21]]) -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = sub i32 0, [[TMP24]] -; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP23]]) -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP26]], 
i32 [[TMP25]]) -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = fmul arcp contract float [[TMP20]], [[TMP27]] -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP31]]) -; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 -; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP31]]) -; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = sub i32 0, [[TMP34]] -; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP33]]) -; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP36]], i32 [[TMP35]]) -; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = fmul arcp contract float [[TMP30]], [[TMP37]] -; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP38]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP24]], 0 +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = sub i32 0, [[TMP26]] +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP25]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP27]]) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = fmul arcp contract float [[TMP20]], [[TMP29]] +; IEEE-BADFREXP-NEXT: 
[[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = sub i32 0, [[TMP33]] +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP34]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = fmul arcp contract float [[TMP21]], [[TMP36]] +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP30]], i64 0 +; IEEE-BADFREXP-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; IEEE-BADFREXP-NEXT: ret void @@ -2525,25 +2488,25 @@ ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[SQR_DENOM:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[DENOM:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[SQR_DENOM]]), !fpmath !3 ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP3:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; DAZ-NEXT: [[TMP4:%.*]] = fmul arcp contract float [[TMP1]], [[TMP3]] -; DAZ-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i64 0 -; DAZ-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP8:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP7]]) -; DAZ-NEXT: [[TMP9:%.*]] = fmul arcp contract float [[TMP6]], [[TMP8]] -; DAZ-NEXT: 
[[ARCP0:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP9]], i64 1 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP6:%.*]] = fmul arcp contract float [[TMP1]], [[TMP5]] +; DAZ-NEXT: [[TMP7:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP8:%.*]] = fmul arcp contract float [[TMP2]], [[TMP7]] +; DAZ-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0 +; DAZ-NEXT: [[ARCP0:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP8]], i64 1 ; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[Y]], i64 0 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP11]]) -; DAZ-NEXT: [[TMP13:%.*]] = fmul arcp contract float [[TMP10]], [[TMP12]] -; DAZ-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i64 0 -; DAZ-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[Y]], i64 1 -; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP17:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP16]]) -; DAZ-NEXT: [[TMP18:%.*]] = fmul arcp contract float [[TMP15]], [[TMP17]] -; DAZ-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP18]], i64 1 +; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[Y]], i64 1 +; DAZ-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP14:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) +; DAZ-NEXT: [[TMP15:%.*]] = fmul arcp contract float [[TMP10]], [[TMP14]] +; DAZ-NEXT: [[TMP16:%.*]] = call arcp 
contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP17:%.*]] = fmul arcp contract float [[TMP11]], [[TMP16]] +; DAZ-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i64 0 +; DAZ-NEXT: [[ARCP1:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP17]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[ARCP0]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: store volatile <2 x float> [[ARCP1]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: ret void @@ -2643,46 +2606,54 @@ ; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator ; IEEE-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; IEEE-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = fcmp contract olt float [[TMP1]], 0x3810000000000000 -; IEEE-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP4:%.*]] = fmul contract float [[TMP1]], [[TMP3]] -; IEEE-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) -; IEEE-NEXT: [[TMP6:%.*]] = select contract i1 [[TMP2]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP7:%.*]] = fmul contract float [[TMP5]], [[TMP6]] -; IEEE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 -; IEEE-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-NEXT: [[TMP10:%.*]] = fcmp contract olt float [[TMP9]], 0x3810000000000000 -; IEEE-NEXT: [[TMP11:%.*]] = select contract i1 [[TMP10]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP12:%.*]] = fmul contract float [[TMP9]], [[TMP11]] -; IEEE-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP12]]) -; IEEE-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP10]], float -4.096000e+03, float -1.000000e+00 -; IEEE-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP13]], [[TMP14]] -; 
IEEE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP15]], i64 1 -; IEEE-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-NEXT: [[TMP18:%.*]] = fdiv contract float 4.000000e+00, [[TMP17]], !fpmath !2 -; IEEE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP18]], i64 2 -; IEEE-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-NEXT: [[TMP21:%.*]] = fdiv contract float undef, [[TMP20]], !fpmath !2 -; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP21]], i64 3 +; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000 +; IEEE-NEXT: [[TMP10:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP10]] +; IEEE-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) +; IEEE-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 +; IEEE-NEXT: [[TMP14:%.*]] = fmul contract float [[TMP12]], [[TMP13]] +; IEEE-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 +; IEEE-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP6]], [[TMP16]] +; IEEE-NEXT: [[TMP18:%.*]] = call contract float 
@llvm.amdgcn.rsq.f32(float [[TMP17]]) +; IEEE-NEXT: [[TMP19:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 +; IEEE-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP19]] +; IEEE-NEXT: [[TMP21:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 +; IEEE-NEXT: [[TMP22:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 +; IEEE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 +; IEEE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP20]], i64 1 +; IEEE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP21]], i64 2 +; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP22]], i64 3 ; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP1]]) -; DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) -; DAZ-NEXT: [[TMP6:%.*]] = fneg contract float [[TMP5]] -; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float 4.000000e+00, [[TMP8]], !fpmath !2 -; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float undef, [[TMP11]], !fpmath !2 -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x 
float> [[TMP10]], float [[TMP12]], i64 3 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP5]]) +; DAZ-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP10]] +; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 +; DAZ-NEXT: [[TMP13:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 +; DAZ-NEXT: [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 +; DAZ-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i64 1 +; DAZ-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP12]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP13]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 @@ -2694,19 +2665,23 @@ ; CHECK-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt ; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[DENOM:%.*]] = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; 
CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = fneg contract afn float [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; CHECK-NEXT: [[TMP9:%.*]] = fdiv contract float 4.000000e+00, [[TMP8]], !fpmath !2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; CHECK-NEXT: [[TMP12:%.*]] = fdiv contract float undef, [[TMP11]], !fpmath !2 -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP12]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; CHECK-NEXT: [[TMP9:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP6]]) +; CHECK-NEXT: [[TMP11:%.*]] = fneg contract afn float [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 +; CHECK-NEXT: [[TMP13:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i64 1 +; 
CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP12]], i64 2 +; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP13]], i64 3 ; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg) @@ -2730,18 +2705,7 @@ ; CHECK-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv ; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = fdiv contract float 1.000000e+00, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = fdiv contract float -1.000000e+00, [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP5]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; CHECK-NEXT: [[TMP8:%.*]] = fdiv contract float 4.000000e+00, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP8]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; CHECK-NEXT: [[TMP11:%.*]] = fdiv contract float undef, [[TMP10]] -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i64 3 +; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> , [[DENOM]] ; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 @@ -2754,74 +2718,74 @@ ; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) ; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; 
IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP2]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP3]]) -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fneg contract float [[TMP9]] -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP11]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP16]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fdiv contract float 4.000000e+00, [[TMP18]], !fpmath !2 -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP19]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = fdiv contract float undef, [[TMP21]], !fpmath !2 -; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP22]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = 
extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP2]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i64 3 ; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; IEEE-BADFREXP-LABEL: define 
<4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt ; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) ; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fneg contract float [[TMP9]] -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP16]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fdiv contract float 4.000000e+00, [[TMP18]], !fpmath !2 -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP19]], i64 2 -; 
IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = fdiv contract float undef, [[TMP21]], !fpmath !2 -; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP22]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP2]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = insertelement <4 x float> 
[[TMP20]], float [[TMP17]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i64 3 ; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]]) -; DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = fneg contract float [[TMP4]] -; DAZ-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP5]]) -; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float 4.000000e+00, [[TMP8]], !fpmath !2 -; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP9]], i64 2 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float undef, [[TMP11]], !fpmath !2 -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP12]], i64 3 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP6:%.*]] = fneg contract float [[TMP2]] +; DAZ-NEXT: [[TMP7:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; 
DAZ-NEXT: [[TMP8:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 +; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 +; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 1 +; DAZ-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP9]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg) @@ -2833,99 +2797,111 @@ ; IEEE-GOODFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp ; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = fcmp arcp contract olt float [[TMP1]], 0x3810000000000000 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = select arcp contract i1 [[TMP2]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = fmul arcp contract float [[TMP1]], [[TMP3]] -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = select arcp contract i1 [[TMP2]], float 4.096000e+03, float 1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = fmul arcp contract float [[TMP5]], [[TMP6]] -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fcmp arcp contract olt float [[TMP9]], 0x3810000000000000 -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = select arcp contract i1 [[TMP10]], float 0x4170000000000000, float 1.000000e+00 -; 
IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = fmul arcp contract float [[TMP9]], [[TMP11]] -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP12]]) -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = select arcp contract i1 [[TMP10]], float -4.096000e+03, float -1.000000e+00 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fmul arcp contract float [[TMP13]], [[TMP14]] -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP15]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP17]]) -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP23]] -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP24]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP26]]) -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP27]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP27]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = sub i32 0, [[TMP29]] -; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP28]]) -; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP31]], i32 [[TMP30]]) -; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = fmul arcp contract float undef, 
[[TMP32]] -; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP33]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fcmp arcp contract olt float [[TMP5]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = select arcp contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul arcp contract float [[TMP5]], [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = select arcp contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = fmul arcp contract float [[TMP12]], [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fcmp arcp contract olt float [[TMP6]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = select arcp contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul arcp contract float [[TMP6]], [[TMP16]] +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = select arcp contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = fmul arcp contract float [[TMP18]], [[TMP19]] +; 
IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP21]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP21]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = sub i32 0, [[TMP23]] +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP22]]) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP24]]) +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP26]] +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = fmul arcp contract float undef, [[TMP33]] +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP20]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP27]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i64 3 ; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp ; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-BADFREXP-NEXT: 
[[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = fcmp arcp contract olt float [[TMP1]], 0x3810000000000000 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = select arcp contract i1 [[TMP2]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = fmul arcp contract float [[TMP1]], [[TMP3]] -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = select arcp contract i1 [[TMP2]], float 4.096000e+03, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = fmul arcp contract float [[TMP5]], [[TMP6]] -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fcmp arcp contract olt float [[TMP9]], 0x3810000000000000 -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = select arcp contract i1 [[TMP10]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = fmul arcp contract float [[TMP9]], [[TMP11]] -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP12]]) -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = select arcp contract i1 [[TMP10]], float -4.096000e+03, float -1.000000e+00 -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fmul arcp contract float [[TMP13]], [[TMP14]] -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP15]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP17]]) -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP17]]) -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call arcp contract float 
@llvm.amdgcn.rcp.f32(float [[TMP19]]) -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP23]] -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP24]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP26]]) -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP27]], 0 -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP26]]) -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = sub i32 0, [[TMP29]] -; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP28]]) -; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP31]], i32 [[TMP30]]) -; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = fmul arcp contract float undef, [[TMP32]] -; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP33]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fcmp arcp contract olt float [[TMP5]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = select arcp contract i1 [[TMP9]], float 
0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul arcp contract float [[TMP5]], [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = select arcp contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = fmul arcp contract float [[TMP12]], [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fcmp arcp contract olt float [[TMP6]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = select arcp contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul arcp contract float [[TMP6]], [[TMP16]] +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = select arcp contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = fmul arcp contract float [[TMP18]], [[TMP19]] +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP21]], 0 +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = sub i32 0, [[TMP23]] +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP24]]) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP26]] +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; 
IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = fmul arcp contract float undef, [[TMP33]] +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP20]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP27]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i64 3 ; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP1]]) -; DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP4]]) -; DAZ-NEXT: [[TMP6:%.*]] = fneg arcp contract float [[TMP5]] -; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; DAZ-NEXT: [[TMP10:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP9]] -; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP10]], i64 2 -; DAZ-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP13:%.*]] = call arcp contract float 
@llvm.amdgcn.rcp.f32(float [[TMP12]]) -; DAZ-NEXT: [[TMP14:%.*]] = fmul arcp contract float undef, [[TMP13]] -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP14]], i64 3 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP9:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP5]]) +; DAZ-NEXT: [[TMP10:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP11:%.*]] = fneg arcp contract float [[TMP10]] +; DAZ-NEXT: [[TMP12:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP13:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP12]] +; DAZ-NEXT: [[TMP14:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP15:%.*]] = fmul arcp contract float undef, [[TMP14]] +; DAZ-NEXT: [[TMP16:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 +; DAZ-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP11]], i64 1 +; DAZ-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP13]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP15]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 @@ -2937,18 +2913,7 @@ ; CHECK-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct ; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; 
CHECK-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = fdiv arcp contract float 1.000000e+00, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = fdiv arcp contract float -1.000000e+00, [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP5]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; CHECK-NEXT: [[TMP8:%.*]] = fdiv arcp contract float 4.000000e+00, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP8]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; CHECK-NEXT: [[TMP11:%.*]] = fdiv arcp contract float undef, [[TMP10]] -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i64 3 +; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> , [[DENOM]] ; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 @@ -2960,98 +2925,98 @@ ; IEEE-GOODFREXP-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp ; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP2]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP4]] -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call arcp float 
@llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fneg arcp float [[TMP9]] -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP11]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP16]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]] -; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP20]]) -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP23]], i32 [[TMP22]]) -; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = fmul arcp float 4.000000e+00, [[TMP24]] -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP25]], i64 2 -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP27]]) -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = 
extractvalue { float, i32 } [[TMP28]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] -; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) -; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) -; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = fmul arcp float undef, [[TMP33]] -; IEEE-GOODFREXP-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP34]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fneg arcp float [[TMP2]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 
+; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fmul arcp float 4.000000e+00, [[TMP23]] +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]]) +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = fmul arcp float undef, [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP17]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP24]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i64 3 ; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RCP]] ; ; IEEE-BADFREXP-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp ; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP2]], 0 -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) -; IEEE-BADFREXP-NEXT: 
[[TMP5:%.*]] = sub i32 0, [[TMP4]] -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP6]], i32 [[TMP5]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fneg arcp float [[TMP9]] -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP11]], 0 -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP13]] -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP14]]) -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP16]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]]) -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]] -; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP20]]) -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP23]], i32 [[TMP22]]) -; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = fmul arcp float 4.000000e+00, [[TMP24]] -; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP25]], i64 2 -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; IEEE-BADFREXP-NEXT: 
[[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP27]]) -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP27]]) -; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] -; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) -; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) -; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = fmul arcp float undef, [[TMP33]] -; IEEE-BADFREXP-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP34]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fneg arcp float [[TMP2]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call arcp float 
@llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = sub i32 0, [[TMP20]] +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fmul arcp float 4.000000e+00, [[TMP23]] +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = fmul arcp float undef, [[TMP30]] +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP17]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP24]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i64 3 ; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RCP]] ; ; DAZ-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP1]]) -; 
DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = fneg arcp float [[TMP4]] -; DAZ-NEXT: [[TMP6:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP5]]) -; DAZ-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP6]], i64 1 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; DAZ-NEXT: [[TMP9:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; DAZ-NEXT: [[TMP10:%.*]] = fmul arcp float 4.000000e+00, [[TMP9]] -; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP10]], i64 2 -; DAZ-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; DAZ-NEXT: [[TMP13:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; DAZ-NEXT: [[TMP14:%.*]] = fmul arcp float undef, [[TMP13]] -; DAZ-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP14]], i64 3 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP1]]) +; DAZ-NEXT: [[TMP6:%.*]] = fneg arcp float [[TMP2]] +; DAZ-NEXT: [[TMP7:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP8:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; DAZ-NEXT: [[TMP9:%.*]] = fmul arcp float 4.000000e+00, [[TMP8]] +; DAZ-NEXT: [[TMP10:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP11:%.*]] = fmul arcp float undef, [[TMP10]] +; DAZ-NEXT: [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP7]], i64 1 +; DAZ-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP9]], i64 2 +; DAZ-NEXT: [[PARTIAL_RCP:%.*]] 
= insertelement <4 x float> [[TMP14]], float [[TMP11]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RCP]] ; %partial.rcp = fdiv arcp <4 x float> , %arg, !fpmath !2 @@ -3061,18 +3026,7 @@ define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp_correct(<4 x float> %arg) { ; CHECK-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp_correct ; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = fdiv arcp float 1.000000e+00, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = fdiv arcp float -1.000000e+00, [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP5]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; CHECK-NEXT: [[TMP8:%.*]] = fdiv arcp float 4.000000e+00, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP8]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; CHECK-NEXT: [[TMP11:%.*]] = fdiv arcp float undef, [[TMP10]] -; CHECK-NEXT: [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i64 3 +; CHECK-NEXT: [[PARTIAL_RCP:%.*]] = fdiv arcp <4 x float> , [[ARG]] ; CHECK-NEXT: ret <4 x float> [[PARTIAL_RCP]] ; %partial.rcp = fdiv arcp <4 x float> , %arg @@ -3084,17 +3038,19 @@ ; CHECK-LABEL: define <4 x float> @rsq_f32_vector_const_denom ; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 -; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = call contract float 
@llvm.amdgcn.rsq.f32(float 2.000000e+00) -; CHECK-NEXT: [[TMP4:%.*]] = fneg contract float [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 -; CHECK-NEXT: [[TMP7:%.*]] = fdiv contract float undef, [[TMP6]], !fpmath !2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP7]], i64 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = fdiv contract float 2.000000e+00, [[TMP9]], !fpmath !2 -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP10]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; CHECK-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; CHECK-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fdiv contract float undef, [[TMP3]], !fpmath !2 +; CHECK-NEXT: [[TMP9:%.*]] = fdiv contract float 2.000000e+00, [[TMP4]], !fpmath !2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 2 +; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP9]], i64 3 ; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %sqrt = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 @@ -3111,14 +3067,14 @@ ; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] ; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call float 
@llvm.amdgcn.rcp.f32(float [[TMP2]]) ; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float -2.000000e+00) -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP9]]) -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP13]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float -2.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP12]], i64 1 ; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float 0x7FF8000000000000, i64 2 ; IEEE-GOODFREXP-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP15]], float 0x3FC99999A0000000, i64 3 ; IEEE-GOODFREXP-NEXT: ret <4 x float> [[CONST_PARTIAL_RCP]] @@ -3131,14 +3087,14 @@ ; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] ; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP2]]) ; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float -2.000000e+00) -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float -2.000000e+00) -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP10]] -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP9]]) -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP11]]) -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP13]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float -2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float -2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP12]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float 0x7FF8000000000000, i64 2 ; IEEE-BADFREXP-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP15]], float 0x3FC99999A0000000, i64 3 ; IEEE-BADFREXP-NEXT: ret <4 x float> [[CONST_PARTIAL_RCP]] @@ -3146,9 +3102,9 @@ ; DAZ-LABEL: define <4 x float> @fdiv_constant_f32_vector ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { ; 
DAZ-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.rcp.f32(float 5.000000e-01) -; DAZ-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; DAZ-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.rcp.f32(float -2.000000e+00) -; DAZ-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i64 1 +; DAZ-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.rcp.f32(float -2.000000e+00) +; DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; DAZ-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP2]], i64 1 ; DAZ-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0x7FF8000000000000, i64 2 ; DAZ-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP5]], float 0x3FC99999A0000000, i64 3 ; DAZ-NEXT: ret <4 x float> [[CONST_PARTIAL_RCP]] Index: llvm/test/CodeGen/AMDGPU/fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -999,11 +999,11 @@ ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[8:9], s6, s6, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[8:9], s7, s7, v1 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s4, v0, s4 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s5, v0, s5 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -1012,23 +1012,23 @@ ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s5 
+; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[8:9], s7, s7, v2 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[8:9], s6, s6, v2 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v1 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, vcc, s5, v1, s5 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s7, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s4, v0, s4 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v1, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v5, v1 +; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v0, v4 +; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v5, v0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v3, v5, v1 +; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v4, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v1, s7, v2 +; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v4, v5 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v2 ; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-FASTFMA-NEXT: s_endpgm ; @@ -1037,11 +1037,11 @@ ; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[2:3], s6, s6, v0 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s4, v2, s4 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s5 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 +; GFX6-SLOWFMA-NEXT: 
v_mov_b32_e32 v2, s7 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -1051,24 +1051,24 @@ ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[2:3], s7, s7, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s5, v3, s5 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s7, v0 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v2, v5, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, v1, v5, v5 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v1 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v2, v5, 1.0 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v5, v5 +; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v1, v5 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v0, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v2, v1, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s7, v4 +; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v4 ; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; @@ -1079,11 
+1079,11 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_div_scale_f32 v2, s[8:9], s6, s6, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_div_scale_f32 v2, s[8:9], s7, s7, v1 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s4, v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s5, v0, s5 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -1092,23 +1092,23 @@ ; GFX7-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX7-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX7-NEXT: v_div_scale_f32 v3, s[8:9], s7, s7, v2 +; GFX7-NEXT: v_div_scale_f32 v3, s[8:9], s6, s6, v2 ; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_div_fixup_f32 v0, v0, s6, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_div_scale_f32 v1, vcc, s5, v1, s5 +; GFX7-NEXT: v_div_fixup_f32 v1, v0, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s4, v0, s4 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, v1, v4 -; GFX7-NEXT: v_fma_f32 v6, -v3, v5, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, v0, v4 +; GFX7-NEXT: v_fma_f32 v6, -v3, v5, v0 ; GFX7-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX7-NEXT: v_fma_f32 v1, -v3, v5, v1 +; GFX7-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v1, v1, v4, v5 -; GFX7-NEXT: v_div_fixup_f32 v1, v1, s7, v2 +; GFX7-NEXT: v_div_fmas_f32 v0, v0, v4, v5 +; GFX7-NEXT: v_div_fixup_f32 v0, v0, s6, v2 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], 
off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1117,11 +1117,11 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s6, s6, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s4, v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -1131,24 +1131,24 @@ ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s7, s7, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, s5, v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-NEXT: v_div_fixup_f32 v0, v1, s6, v0 +; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v1, -v2, v5, 1.0 -; GFX8-NEXT: v_fma_f32 v1, v1, v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v1 +; GFX8-NEXT: v_fma_f32 v0, -v2, v5, 1.0 +; GFX8-NEXT: v_fma_f32 v0, v0, v5, v5 +; GFX8-NEXT: v_mul_f32_e32 v5, v3, v0 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v1, v5 +; GFX8-NEXT: v_fma_f32 v5, v6, v0, v5 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v1, v2, v1, v5 +; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v4 +; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -1156,8 +1156,8 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s6, s6, s4 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s4, s6, s4 +; GFX10-NEXT: v_div_scale_f32 v0, s2, s7, s7, s5 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 @@ -1168,22 +1168,22 @@ ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s2, s7, s7, s5 +; GFX10-NEXT: v_div_scale_f32 v2, s2, s6, s6, s4 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, s5, s7, s5 -; GFX10-NEXT: v_rcp_f32_e32 v1, v2 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s4 +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v4, -v2, v1, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v1, v4, v1 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 -; GFX10-NEXT: v_fma_f32 v5, -v2, v4, v3 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v1 -; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v3 +; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v4, v0, v3 +; GFX10-NEXT: v_fma_f32 v5, -v2, v4, v0 +; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX10-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v1, v2, v1, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_div_fixup_f32 v1, v1, s7, s5 +; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -1194,8 +1194,8 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s6, s6, s4 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s4, s6, s4 +; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1206,23 +1206,23 @@ ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_scale_f32 v2, null, s7, s7, s5 +; GFX11-NEXT: v_div_scale_f32 v2, null, s6, s6, s4 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, s5, s7, s5 -; GFX11-NEXT: v_rcp_f32_e32 v1, v2 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s6, s4 +; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: v_div_fixup_f32 v1, v0, s7, s5 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v1, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v1, v4, v1 -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1 -; GFX11-NEXT: v_fma_f32 v5, -v2, v4, v3 -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v1 -; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v3 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v1, v2, v1, v4 +; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_mul_f32_e32 v4, v0, v3 +; GFX11-NEXT: v_fma_f32 v5, -v2, v4, v0 +; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX11-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: v_div_fixup_f32 v1, v1, s7, s5 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s6, s4 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: 
s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1411,10 +1411,10 @@ ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s6 -; GFX67-NEXT: v_rcp_f32_e32 v1, s7 -; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX67-NEXT: v_mul_f32_e32 v1, s5, v1 +; GFX67-NEXT: v_rcp_f32_e32 v0, s7 +; GFX67-NEXT: v_rcp_f32_e32 v2, s6 +; GFX67-NEXT: v_mul_f32_e32 v1, s5, v0 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; @@ -1423,12 +1423,12 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s6 -; GFX8-NEXT: v_rcp_f32_e32 v1, s7 +; GFX8-NEXT: v_rcp_f32_e32 v0, s7 +; GFX8-NEXT: v_rcp_f32_e32 v2, s6 +; GFX8-NEXT: v_mul_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s5, v1 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -1437,13 +1437,13 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s6 -; GFX10-NEXT: v_rcp_f32_e32 v1, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, s5, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_rcp_f32_e32 v2, s6 +; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: @@ -1452,12 +1452,12 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s6 -; GFX11-NEXT: v_rcp_f32_e32 v1, s7 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 +; GFX11-NEXT: v_rcp_f32_e32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1490,11 +1490,11 @@ ; GFX6-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s10, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[12:13], s4, s4, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[12:13], s7, s7, v1 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s0, v0, s0 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -1503,53 +1503,53 @@ ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[12:13], s5, s5, v2 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s4, v1 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, vcc, s1, v1, s1 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v3, v0, s7, v1 +; 
GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[12:13], s6, s6, v1 +; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v1, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v5, v1 +; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v0, v4 +; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v3, v5, v1 +; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v5, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v3, s2 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v4, v5 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, s[0:1], s6, s6, v3 +; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v4, v5 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v2, v0, s6, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, s[2:3], s5, s5, v1 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v1, s5, v2 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, vcc, s2, v2, s2 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s1, v0, s1 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v2, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v4, v6, v2 +; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v0, v5 +; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v4, v6, v0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v4, v6, v2 +; GFX6-FASTFMA-NEXT: v_fma_f32 
v0, -v4, v6, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v4, s3 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v5, v6 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, s[0:1], s7, s7, v4 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v4, s0 +; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v5, v6 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, s[2:3], s4, s4, v4 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v6, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v2, v2, s6, v3 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, s3, v3, s3 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s5, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s0, v0, s0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v6, v6 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX6-FASTFMA-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v7, v0, v6 +; GFX6-FASTFMA-NEXT: v_fma_f32 v8, -v5, v7, v0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v7, v8, v6, v7 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v5, v7, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v3, v3, v6, v7 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v3, v3, s7, v4 +; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v6, v7 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s4, v4 ; GFX6-FASTFMA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-FASTFMA-NEXT: s_endpgm ; @@ -1559,11 +1559,13 @@ ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[10:11], s4, s4, v0 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s0, v2, s0 -; 
GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s1 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[10:11], s7, s7, v0 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v7, s1 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v8, s0 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -1573,54 +1575,52 @@ ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[10:11], s5, s5, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[10:11], s6, s6, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s1, v3, s1 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v6, s2 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v5, vcc, s2, v3, s2 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s10, -1 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, s4, v0 +; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v6, v2 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v3, v1, s7, v0 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v2, v5, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, v1, v5, v5 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v1 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v7, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v7, v1, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v2, v6, 1.0 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v6, v6 +; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v1, v5, v0 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v1, v5 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, v6, 
v0, v1 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v1, v5 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, s[0:1], s6, s6, v6 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v2, v1, v5 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s2, v2, s2 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v7, s3 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v3 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s5, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v5, s[2:3], s5, s5, v7 +; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v1 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, s1, v1, s1 +; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v2, v0, s6, v4 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, -v3, v5, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v4, v5, v5 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v2, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v8, -v3, v5, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v8, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v5, v6, 1.0 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v6, v6 +; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v4, v1, v0 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v5, v4, v1 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v6, v0, v4 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v5, v4, v1 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, s[0:1], s7, s7, v7 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s7 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v4, vcc, s3, v4, s3 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v3 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v2, v2, s6, v6 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v5, s[2:3], s4, s4, v8 +; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v1, v0, v4 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 
v4, vcc, s0, v1, s0 +; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v0, s5, v7 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v8, -v3, v6, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, v8, v5, v6 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v5, v6, 1.0 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v6, v6 +; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v6, v4, v0 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v7, -v5, v6, v4 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, v7, v0, v6 +; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, -v5, v6, v4 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v3, v3, s7, v7 +; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v4, v0, v6 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, v8 ; GFX6-SLOWFMA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; @@ -1632,11 +1632,11 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_div_scale_f32 v2, s[12:13], s4, s4, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_div_scale_f32 v2, s[12:13], s7, s7, v1 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s0, v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -1645,53 +1645,53 @@ ; GFX7-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX7-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: 
v_div_fmas_f32 v0, v0, v3, v4 -; GFX7-NEXT: v_div_scale_f32 v3, s[12:13], s5, s5, v2 -; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_div_fixup_f32 v0, v0, s4, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_div_scale_f32 v1, vcc, s1, v1, s1 +; GFX7-NEXT: v_div_fixup_f32 v3, v0, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_div_scale_f32 v2, s[12:13], s6, s6, v1 +; GFX7-NEXT: v_rcp_f32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX7-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, v1, v4 -; GFX7-NEXT: v_fma_f32 v6, -v3, v5, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, v0, v4 +; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v0 ; GFX7-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX7-NEXT: v_fma_f32 v1, -v3, v5, v1 +; GFX7-NEXT: v_fma_f32 v0, -v2, v5, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 -; GFX7-NEXT: v_div_fmas_f32 v1, v1, v4, v5 -; GFX7-NEXT: v_div_scale_f32 v4, s[0:1], s6, s6, v3 +; GFX7-NEXT: v_div_fmas_f32 v0, v0, v4, v5 +; GFX7-NEXT: v_div_fixup_f32 v2, v0, s6, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_div_scale_f32 v4, s[2:3], s5, s5, v1 ; GFX7-NEXT: v_rcp_f32_e32 v5, v4 -; GFX7-NEXT: v_div_fixup_f32 v1, v1, s5, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_div_scale_f32 v2, vcc, s2, v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s1, v0, s1 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; GFX7-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, v2, v5 -; GFX7-NEXT: v_fma_f32 v7, -v4, v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, v0, v5 +; GFX7-NEXT: v_fma_f32 v7, -v4, v6, v0 ; GFX7-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX7-NEXT: v_fma_f32 v2, -v4, v6, v2 +; GFX7-NEXT: v_fma_f32 v0, -v4, v6, v0 ; 
GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v5, v6 -; GFX7-NEXT: v_div_scale_f32 v5, s[0:1], s7, s7, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_div_fmas_f32 v0, v0, v5, v6 +; GFX7-NEXT: v_div_scale_f32 v5, s[2:3], s4, s4, v4 ; GFX7-NEXT: v_rcp_f32_e32 v6, v5 -; GFX7-NEXT: v_div_fixup_f32 v2, v2, s6, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_div_scale_f32 v3, vcc, s3, v3, s3 +; GFX7-NEXT: v_div_fixup_f32 v1, v0, s5, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s0, v0, s0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; GFX7-NEXT: v_fma_f32 v6, v7, v6, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX7-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX7-NEXT: v_mul_f32_e32 v7, v0, v6 +; GFX7-NEXT: v_fma_f32 v8, -v5, v7, v0 ; GFX7-NEXT: v_fma_f32 v7, v8, v6, v7 -; GFX7-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX7-NEXT: v_fma_f32 v0, -v5, v7, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v3, v3, v6, v7 -; GFX7-NEXT: v_div_fixup_f32 v3, v3, s7, v4 +; GFX7-NEXT: v_div_fmas_f32 v0, v0, v6, v7 +; GFX7-NEXT: v_div_fixup_f32 v0, v0, s4, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm ; @@ -1701,11 +1701,13 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_div_scale_f32 v1, s[10:11], s4, s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s0, v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_div_scale_f32 v1, s[10:11], s7, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: 
v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -1715,54 +1717,52 @@ ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[10:11], s5, s5, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[10:11], s6, s6, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, s1, v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-NEXT: v_div_fixup_f32 v0, v1, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_div_scale_f32 v5, vcc, s2, v3, s2 +; GFX8-NEXT: v_rcp_f32_e32 v6, v2 +; GFX8-NEXT: v_div_fixup_f32 v3, v1, s7, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v1, -v2, v5, 1.0 -; GFX8-NEXT: v_fma_f32 v1, v1, v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v1 -; GFX8-NEXT: v_fma_f32 v7, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v7, v1, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-NEXT: v_fma_f32 v0, -v2, v6, 1.0 +; GFX8-NEXT: v_fma_f32 v0, v0, v6, v6 +; GFX8-NEXT: v_mul_f32_e32 v1, v5, v0 +; GFX8-NEXT: v_fma_f32 v6, -v2, v1, v5 +; GFX8-NEXT: v_fma_f32 v1, v6, v0, v1 +; GFX8-NEXT: v_fma_f32 v2, -v2, v1, v5 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v3, s[0:1], s6, s6, v6 -; GFX8-NEXT: v_div_fmas_f32 v1, v2, v1, v5 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s2, v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-NEXT: v_div_fixup_f32 v1, v1, s5, v4 +; GFX8-NEXT: v_div_scale_f32 v5, s[2:3], s5, s5, v7 +; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_div_scale_f32 v1, vcc, s1, v1, s1 +; GFX8-NEXT: v_rcp_f32_e32 v6, v5 +; GFX8-NEXT: v_div_fixup_f32 v2, v0, s6, v4 ; GFX8-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v4, -v3, v5, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v4, v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v5, v2, v4 -; GFX8-NEXT: v_fma_f32 v8, -v3, v5, v2 -; GFX8-NEXT: v_fma_f32 v5, v8, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX8-NEXT: v_fma_f32 v0, -v5, v6, 1.0 +; GFX8-NEXT: v_fma_f32 v0, v0, v6, v6 +; GFX8-NEXT: v_mul_f32_e32 v4, v1, v0 +; GFX8-NEXT: v_fma_f32 v6, -v5, v4, v1 +; GFX8-NEXT: v_fma_f32 v4, v6, v0, v4 +; GFX8-NEXT: v_fma_f32 v1, -v5, v4, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v3, s[0:1], s7, s7, v7 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_div_scale_f32 v4, vcc, s3, v4, s3 -; GFX8-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-NEXT: v_div_fixup_f32 v2, v2, s6, v6 +; GFX8-NEXT: v_div_scale_f32 v5, s[2:3], s4, s4, v8 +; GFX8-NEXT: v_div_fmas_f32 v0, v1, v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_div_scale_f32 v4, vcc, s0, v1, s0 +; GFX8-NEXT: v_rcp_f32_e32 v6, v5 +; GFX8-NEXT: v_div_fixup_f32 v1, v0, s5, v7 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; GFX8-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v8, -v3, v6, v4 -; GFX8-NEXT: v_fma_f32 v6, v8, v5, v6 -; GFX8-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX8-NEXT: v_fma_f32 v0, -v5, v6, 1.0 +; GFX8-NEXT: v_fma_f32 v0, v0, v6, v6 +; GFX8-NEXT: v_mul_f32_e32 v6, v4, v0 +; GFX8-NEXT: v_fma_f32 v7, -v5, v6, v4 +; GFX8-NEXT: v_fma_f32 v6, v7, v0, v6 +; GFX8-NEXT: v_fma_f32 v4, -v5, v6, v4 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX8-NEXT: v_div_fmas_f32 v0, v4, v0, v6 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_div_fixup_f32 v3, v3, s7, v7 +; GFX8-NEXT: v_div_fixup_f32 v0, v0, s4, v8 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm 
; @@ -1772,8 +1772,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s10, s4, s4, s0 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s0, s4, s0 +; GFX10-NEXT: v_div_scale_f32 v0, s10, s7, s7, s3 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s7, s3 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1783,48 +1783,48 @@ ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s10, s5, s5, s1 +; GFX10-NEXT: v_div_scale_f32 v2, s10, s6, s6, s2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX10-NEXT: v_div_scale_f32 v1, vcc_lo, s1, s5, s1 +; GFX10-NEXT: v_div_scale_f32 v1, vcc_lo, s2, s6, s2 ; GFX10-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, s0 +; GFX10-NEXT: v_div_fixup_f32 v3, v0, s7, s3 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v3, -v2, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v3, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v1, v4 -; GFX10-NEXT: v_fma_f32 v5, -v2, v3, v1 -; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_fma_f32 v1, -v2, v3, v1 +; GFX10-NEXT: v_fma_f32 v0, -v2, v4, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v4 +; GFX10-NEXT: v_mul_f32_e32 v0, v1, v4 +; GFX10-NEXT: v_fma_f32 v5, -v2, v0, v1 +; GFX10-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX10-NEXT: v_fma_f32 v1, -v2, v0, v1 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s0, s6, s6, s2 -; GFX10-NEXT: v_div_fmas_f32 v1, v1, v4, v3 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, s2, s6, s2 -; GFX10-NEXT: v_rcp_f32_e32 v5, v2 -; GFX10-NEXT: v_div_fixup_f32 v1, v1, s5, s1 +; GFX10-NEXT: v_div_scale_f32 v5, s3, s5, s5, s1 +; GFX10-NEXT: v_div_fmas_f32 v0, v1, v4, v0 +; GFX10-NEXT: v_div_scale_f32 v1, vcc_lo, s1, s5, s1 +; GFX10-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-NEXT: v_div_fixup_f32 v2, v0, s6, s2 ; 
GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v4, -v2, v5, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v5, v4, v5 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v5 -; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v3 -; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v3 +; GFX10-NEXT: v_fma_f32 v0, -v5, v6, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v6, v0, v6 +; GFX10-NEXT: v_mul_f32_e32 v0, v1, v6 +; GFX10-NEXT: v_fma_f32 v4, -v5, v0, v1 +; GFX10-NEXT: v_fmac_f32_e32 v0, v4, v6 +; GFX10-NEXT: v_fma_f32 v1, -v5, v0, v1 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v3, s0, s7, s7, s3 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v5, v4 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, s3, s7, s3 -; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: v_div_fixup_f32 v2, v2, s6, s2 +; GFX10-NEXT: v_div_scale_f32 v4, s2, s4, s4, s0 +; GFX10-NEXT: v_div_fmas_f32 v0, v1, v6, v0 +; GFX10-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-NEXT: v_div_fixup_f32 v1, v0, s5, s1 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s4, s0 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX10-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX10-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-NEXT: v_mul_f32_e32 v6, v0, v5 +; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v0 +; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX10-NEXT: v_fma_f32 v0, -v4, v6, v0 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_div_fixup_f32 v3, v3, s7, s3 +; GFX10-NEXT: v_div_fmas_f32 v0, v0, v5, v6 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, s0 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX10-NEXT: s_endpgm ; @@ -1834,8 +1834,8 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, s0 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s0, s4, s0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s3 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s7, s3 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1846,51 +1846,51 @@ ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_scale_f32 v2, null, s5, s5, s1 +; GFX11-NEXT: v_div_scale_f32 v2, null, s6, s6, s2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX11-NEXT: v_div_scale_f32 v1, vcc_lo, s1, s5, s1 +; GFX11-NEXT: v_div_scale_f32 v1, vcc_lo, s2, s6, s2 ; GFX11-NEXT: v_rcp_f32_e32 v4, v2 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, s0 +; GFX11-NEXT: v_div_fixup_f32 v3, v0, s7, s3 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v3, -v2, v4, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v3, v4 -; GFX11-NEXT: v_mul_f32_e32 v3, v1, v4 -; GFX11-NEXT: v_fma_f32 v5, -v2, v3, v1 -; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v4 -; GFX11-NEXT: v_fma_f32 v1, -v2, v3, v1 +; GFX11-NEXT: v_fma_f32 v0, -v2, v4, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v4 +; GFX11-NEXT: v_mul_f32_e32 v0, v1, v4 +; GFX11-NEXT: v_fma_f32 v5, -v2, v0, v1 +; GFX11-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX11-NEXT: v_fma_f32 v1, -v2, v0, v1 ; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_scale_f32 v2, null, s6, s6, s2 -; GFX11-NEXT: v_div_fmas_f32 v1, v1, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, s2, s6, s2 -; GFX11-NEXT: v_rcp_f32_e32 v5, v2 -; GFX11-NEXT: v_div_fixup_f32 v1, v1, s5, s1 +; GFX11-NEXT: v_div_scale_f32 v5, null, s5, s5, s1 +; GFX11-NEXT: v_div_fmas_f32 v0, v1, v4, v0 +; GFX11-NEXT: v_div_scale_f32 v1, vcc_lo, s1, s5, s1 +; GFX11-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-NEXT: v_div_fixup_f32 v2, v0, s6, s2 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, 
-v2, v5, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v5, v4, v5 -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v5 -; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v3 -; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v5 -; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v3 +; GFX11-NEXT: v_fma_f32 v0, -v5, v6, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v6, v0, v6 +; GFX11-NEXT: v_mul_f32_e32 v0, v1, v6 +; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v1 +; GFX11-NEXT: v_fmac_f32_e32 v0, v4, v6 +; GFX11-NEXT: v_fma_f32 v1, -v5, v0, v1 ; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_scale_f32 v3, null, s7, s7, s3 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v5, v4 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, s3, s7, s3 -; GFX11-NEXT: v_rcp_f32_e32 v4, v3 -; GFX11-NEXT: v_div_fixup_f32 v2, v2, s6, s2 +; GFX11-NEXT: v_div_scale_f32 v4, null, s4, s4, s0 +; GFX11-NEXT: v_div_fmas_f32 v0, v1, v6, v0 +; GFX11-NEXT: v_rcp_f32_e32 v5, v4 +; GFX11-NEXT: v_div_fixup_f32 v1, v0, s5, s1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s4, s0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX11-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX11-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX11-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX11-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-NEXT: v_mul_f32_e32 v6, v0, v5 +; GFX11-NEXT: v_fma_f32 v7, -v4, v6, v0 +; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX11-NEXT: v_fma_f32 v0, -v4, v6, v0 ; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_div_fixup_f32 v3, v3, s7, s3 +; GFX11-NEXT: v_div_fmas_f32 v0, v0, v5, v6 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, s0 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[8:9] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2045,14 +2045,14 @@ ; GFX67-NEXT: s_mov_b32 s11, 0xf000 ; GFX67-NEXT: s_mov_b32 s10, -1 ; GFX67-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s4 -; GFX67-NEXT: v_rcp_f32_e32 v1, s5 -; GFX67-NEXT: v_rcp_f32_e32 v2, s6 -; GFX67-NEXT: v_rcp_f32_e32 v3, s7 -; GFX67-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX67-NEXT: v_mul_f32_e32 v1, s1, v1 -; GFX67-NEXT: v_mul_f32_e32 v2, s2, v2 -; GFX67-NEXT: v_mul_f32_e32 v3, s3, v3 +; GFX67-NEXT: v_rcp_f32_e32 v0, s7 +; GFX67-NEXT: v_rcp_f32_e32 v1, s6 +; GFX67-NEXT: v_rcp_f32_e32 v4, s5 +; GFX67-NEXT: v_rcp_f32_e32 v5, s4 +; GFX67-NEXT: v_mul_f32_e32 v3, s3, v0 +; GFX67-NEXT: v_mul_f32_e32 v2, s2, v1 +; GFX67-NEXT: v_mul_f32_e32 v1, s1, v4 +; GFX67-NEXT: v_mul_f32_e32 v0, s0, v5 ; GFX67-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX67-NEXT: s_endpgm ; @@ -2064,33 +2064,33 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s4 -; GFX8-NEXT: v_rcp_f32_e32 v1, s5 -; GFX8-NEXT: v_rcp_f32_e32 v2, s6 -; GFX8-NEXT: v_rcp_f32_e32 v3, s7 -; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s1, v1 -; GFX8-NEXT: v_mul_f32_e32 v2, s2, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, s3, v3 +; GFX8-NEXT: v_rcp_f32_e32 v0, s7 +; GFX8-NEXT: v_rcp_f32_e32 v1, s6 +; GFX8-NEXT: v_rcp_f32_e32 v6, s5 +; GFX8-NEXT: v_rcp_f32_e32 v7, s4 +; GFX8-NEXT: v_mul_f32_e32 v3, s3, v0 +; GFX8-NEXT: v_mul_f32_e32 v2, s2, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, s1, v6 +; GFX8-NEXT: v_mul_f32_e32 v0, s0, v7 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v4f32_arcp_math: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s4 -; GFX10-NEXT: v_rcp_f32_e32 v1, s5 -; GFX10-NEXT: v_rcp_f32_e32 v2, s6 -; GFX10-NEXT: v_rcp_f32_e32 v3, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX10-NEXT: 
v_mul_f32_e32 v1, s1, v1 -; GFX10-NEXT: v_mul_f32_e32 v2, s2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, s3, v3 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_rcp_f32_e32 v1, s6 +; GFX10-NEXT: v_rcp_f32_e32 v4, s5 +; GFX10-NEXT: v_rcp_f32_e32 v5, s4 +; GFX10-NEXT: v_mul_f32_e32 v3, s3, v0 +; GFX10-NEXT: v_mul_f32_e32 v2, s2, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, s1, v4 +; GFX10-NEXT: v_mul_f32_e32 v0, s0, v5 +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v4f32_arcp_math: @@ -2099,15 +2099,15 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s4 -; GFX11-NEXT: v_rcp_f32_e32 v1, s5 -; GFX11-NEXT: v_rcp_f32_e32 v2, s6 -; GFX11-NEXT: v_rcp_f32_e32 v3, s7 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 +; GFX11-NEXT: v_rcp_f32_e32 v1, s6 +; GFX11-NEXT: v_rcp_f32_e32 v4, s5 +; GFX11-NEXT: v_rcp_f32_e32 v5, s4 +; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mul_f32 v3, s3, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX11-NEXT: v_dual_mul_f32 v2, s2, v1 :: v_dual_mul_f32 v1, s1, v4 +; GFX11-NEXT: v_mul_f32_e32 v0, s0, v5 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[8:9] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm