diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -606,24 +606,23 @@ return true; } -// Perform RCP optimizations: +// Optimize fdiv with rcp: // -// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with -// denormals flushed. +// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is +// allowed with unsafe-fp-math or afn. // -// a/b -> a*rcp(b) when fast unsafe rcp is legal. -static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal, - IRBuilder<> Builder, MDNode *FPMath, Module *Mod, - bool HasDenormals, bool NeedHighAccuracy) { +// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. +static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, + bool RcpIsAccurate, IRBuilder<> Builder, + Module *Mod) { - Type *Ty = Den->getType(); - if (!FastUnsafeRcpLegal && Ty->isFloatTy() && - (HasDenormals || NeedHighAccuracy)) + if (!AllowInaccurateRcp && !RcpIsAccurate) return nullptr; + Type *Ty = Den->getType(); Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty); if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) { - if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) { + if (AllowInaccurateRcp || RcpIsAccurate) { if (CLHS->isExactlyValue(1.0)) { // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. @@ -648,49 +647,63 @@ } } - if (FastUnsafeRcpLegal) { + if (AllowInaccurateRcp) { // Turn into multiply by the reciprocal. 
// x / y -> x * (1.0 / y) Value *Recip = Builder.CreateCall(Decl, { Den }); - return Builder.CreateFMul(Num, Recip, "", FPMath); + return Builder.CreateFMul(Num, Recip); } return nullptr; } -static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal, - bool HasDenormals) { - const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); - if (!CNum) - return HasDenormals; +// optimize with fdiv.fast: +// +// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. +// +// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. +// +// NOTE: optimizeWithRcp should be tried first because rcp is the preference. +static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, + bool HasDenormals, IRBuilder<> Builder, + Module *Mod) { + // fdiv.fast can achieve 2.5 ULP accuracy. + if (ReqdAccuracy < 2.5f) + return nullptr; - if (FastUnsafeRcpLegal) - return true; + // Only have fdiv.fast for f32. + Type *Ty = Den->getType(); + if (!Ty->isFloatTy()) + return nullptr; - bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0); + bool NumIsOne = false; + if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) { + if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0)) + NumIsOne = true; + } - // Reciprocal f32 is handled separately without denormals. - return HasDenormals ^ IsOne; -} + // fdiv does not support denormals. But 1.0/x is always fine to use it. + if (HasDenormals && !NumIsOne) + return nullptr; + Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); + return Builder.CreateCall(Decl, { Num, Den }); +} -// Optimizations is performed based on fpmath, fast math flags as wells as -// denormals to lower fdiv using either rcp or fdiv.fast. +// Optimizations is performed based on fpmath, fast math flags as well as +// denormals to optimize fdiv with either rcp or fdiv.fast. +// +// With rcp: +// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is +// allowed with unsafe-fp-math or afn. 
// -// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on -// unsafe-fp-math, fast math flags, denormals and fpmath -// accuracy request. +// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. // -// RCP Optimizations: -// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with -// denormals flushed. -// a/b -> a*rcp(b) when fast unsafe rcp is legal. +// With fdiv.fast: +// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. // -// Use fdiv.fast: -// a/b -> fdiv.fast(a, b) when RCP optimization is not performed and -// fpmath >= 2.5ULP with denormals flushed. +// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. // -// 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and -// fpmath >= 2.5ULP with denormals. +// NOTE: rcp is the preference in cases that both are legal. bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Type *Ty = FDiv.getType()->getScalarType(); @@ -700,19 +713,17 @@ return false; const FPMathOperator *FPOp = cast<FPMathOperator>(&FDiv); - MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); - const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f; + const float ReqdAccuracy = FPOp->getFPAccuracy(); + // Inaccurate rcp is allowed with unsafe-fp-math or afn. FastMathFlags FMF = FPOp->getFastMathFlags(); - // Determine whether it is ok to use rcp based on unsafe-fp-math, - // fast math flags, denormals and accuracy request. - const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() || - (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy) || FMF.approxFunc())); + const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc(); - // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used. - const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy && - !FastUnsafeRcpLegal; + // rcp_f16 is accurate for !fpmath >= 1.0ulp. + // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. 
+ // rcp_f64 is never accurate. + const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) || + (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f); IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); Builder.setFastMathFlags(FMF); @@ -730,31 +741,24 @@ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { Value *NumEltI = Builder.CreateExtractElement(Num, I); Value *DenEltI = Builder.CreateExtractElement(Den, I); - Value *NewElt = nullptr; - if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal, - HasFP32Denormals)) { - Function *Decl = - Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath); - } - if (!NewElt) // Try rcp. - NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder, - FPMath, Mod, HasFP32Denormals, NeedHighAccuracy); - if (!NewElt) - NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath); + // Try rcp first. + Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp, + RcpIsAccurate, Builder, Mod); + if (!NewElt) // Try fdiv.fast. + NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy, + HasFP32Denormals, Builder, Mod); + if (!NewElt) // Keep the original. + NewElt = Builder.CreateFDiv(NumEltI, DenEltI); NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } - } else { // Scalar. - if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal, - HasFP32Denormals)) { - Function *Decl = - Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath); - } - if (!NewFDiv) { // Try rcp. - NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath, - Mod, HasFP32Denormals, NeedHighAccuracy); + } else { // Scalar FDiv. + // Try rcp first. + NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate, + Builder, Mod); + if (!NewFDiv) { // Try fdiv.fast. 
+ NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals, + Builder, Mod); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7418,19 +7418,12 @@ EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath || - (Flags.hasAllowReciprocal() && - ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) || - VT == MVT::f16 || - Flags.hasApproximateFuncs())); - - // Do rcp optimization only when fast unsafe rcp is legal here. - // NOTE: We already performed RCP optimization to insert intrinsics in - // AMDGPUCodeGenPrepare. Ideally there should have no opportunity here to - // rcp optimization. - // However, there are cases like FREM, which is expended into a sequence - // of instructions including FDIV, which may expose new opportunities. - if (!FastUnsafeRcpLegal) + bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath || + Flags.hasApproximateFuncs(); + + // Without !fpmath accuracy information, we can't do more because we don't + // know exactly whether rcp is accurate enough to meet !fpmath requirement. 
+ if (!AllowInaccurateRcp) return SDValue(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -12,14 +12,14 @@ ; CHECK-LABEL: @fdiv_fpmath( ; CHECK: %no.md = fdiv float %a, %b{{$}} -; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 -; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 -; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 +; CHECK: %md.half.ulp = fdiv float %a, %b +; CHECK: %md.1ulp = fdiv float %a, %b +; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b) +; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b) ; CHECK: %[[FAST_RCP:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b) -; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]], !fpmath !0 -; CHECK: %[[ARCP_RCP:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %b) -; CHECK: arcp.md.25ulp = fmul arcp float %a, %[[ARCP_RCP]], !fpmath !0 +; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]] +; CHECK: %[[AFN_RCP:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: afn.md.25ulp = fmul afn float %a, %[[AFN_RCP]] define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out @@ -39,8 +39,8 @@ %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 store volatile float %fast.md.25ulp, float addrspace(1)* %out - %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 - store volatile float %arcp.md.25ulp, float addrspace(1)* %out + %afn.md.25ulp = fdiv afn float %a, %b, !fpmath !0 + store volatile float %afn.md.25ulp, float addrspace(1)* %out ret void } @@ 
-48,9 +48,9 @@ ; CHECK-LABEL: @rcp_fdiv_fpmath( ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} ; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x) -; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 -; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x -; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x) +; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x +; CHECK: %afn.no.md = call afn float @llvm.amdgcn.rcp.f32(float %x) +; CHECK: %afn.25ulp = call afn float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x) define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { @@ -63,11 +63,11 @@ %md.half.ulp = fdiv float 1.0, %x, !fpmath !1 store volatile float %md.half.ulp, float addrspace(1)* %out - %arcp.no.md = fdiv arcp float 1.0, %x - store volatile float %arcp.no.md, float addrspace(1)* %out + %afn.no.md = fdiv afn float 1.0, %x + store volatile float %afn.no.md, float addrspace(1)* %out - %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0 - store volatile float %arcp.25ulp, float addrspace(1)* %out + %afn.25ulp = fdiv afn float 1.0, %x, !fpmath !0 + store volatile float %afn.25ulp, float addrspace(1)* %out %fast.no.md = fdiv fast float 1.0, %x store volatile float %fast.no.md, float addrspace(1)* %out @@ -78,28 +78,6 @@ ret void } -; CHECK-LABEL: @rcp_fdiv_arcp_denormal( -; CHECK: %arcp.low.accuracy = call arcp float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float %x), !fpmath !0 -; CHECK: %arcp.high.accuracy = fdiv arcp float 1.000000e+00, %x, !fpmath !2 -; CHECK: %arcp.low.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) -; CHECK: %arcp.high.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) -define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, float %x) #2 { - - %arcp.low.accuracy = fdiv arcp float 1.0, %x, !fpmath !0 - store volatile 
float %arcp.low.accuracy, float addrspace(1)* %out - - %arcp.high.accuracy = fdiv arcp float 1.0, %x, !fpmath !2 - store volatile float %arcp.high.accuracy, float addrspace(1)* %out - - %arcp.low.afn = fdiv arcp afn float 1.0, %x, !fpmath !0 - store volatile float %arcp.low.afn, float addrspace(1)* %out - - %arcp.high.afn = fdiv arcp afn float 1.0, %x, !fpmath !2 - store volatile float %arcp.high.afn, float addrspace(1)* %out - - ret void -} - ; CHECK-LABEL: @fdiv_fpmath_vector( ; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 ; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 @@ -113,31 +91,31 @@ ; CHECK: %[[HALF_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 ; CHECK: %[[HALF_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 -; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float %[[HALF_A0]], %[[HALF_B0]], !fpmath !1 +; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float %[[HALF_A0]], %[[HALF_B0]] ; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0 ; CHECK: %[[HALF_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 ; CHECK: %[[HALF_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 -; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]], !fpmath !1 +; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]] ; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1 ; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out ; CHECK: %[[ONE_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 ; CHECK: %[[ONE_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 -; CHECK: %[[ONE_FDIV0:[0-9]+]] = fdiv float %[[ONE_A0]], %[[ONE_B0]], !fpmath !2 +; CHECK: %[[ONE_FDIV0:[0-9]+]] = fdiv float %[[ONE_A0]], %[[ONE_B0]] ; CHECK: %[[ONE_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ONE_FDIV0]], i64 0 ; CHECK: %[[ONE_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 ; CHECK: %[[ONE_B1:[0-9]+]] = extractelement <2 x float> 
%b, i64 1 -; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]], !fpmath !2 +; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]] ; CHECK: %md.1ulp = insertelement <2 x float> %[[ONE_INS0]], float %[[ONE_FDIV1]], i64 1 ; CHECK: store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out ; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 ; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 -; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0 +; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]) ; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0 ; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 -; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0 +; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]) ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1 define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { %no.md = fdiv <2 x float> %a, %b @@ -165,20 +143,20 @@ ; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out ; CHECK: %[[HALF0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]], !fpmath !1 +; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]] ; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0 ; CHECK: %[[HALF1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF1]], !fpmath !1 +; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF1]] ; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float 
%[[HALF_FDIV1]], i64 1 ; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out -; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] -; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 -; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO1]] -; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 -; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out +; CHECK: %[[AFN_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[AFN_NO_FDIV0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO0]]) +; CHECK: %[[AFN_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_NO_FDIV0]], i64 0 +; CHECK: %[[AFN_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[AFN_NO_FDIV1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO1]]) +; CHECK: %afn.no.md = insertelement <2 x float> %[[AFN_NO_INS0]], float %[[AFN_NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out ; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 ; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]]) @@ -188,13 +166,13 @@ ; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_RCP1]], i64 1 ; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out -; CHECK: %[[ARCP_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]]) -; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0 -; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; 
CHECK: %[[ARCP_25_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]]) -; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_RCP1]], i64 1 -; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out +; CHECK: %[[AFN_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[AFN_25_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_250]]) +; CHECK: %[[AFN_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_25_RCP0]], i64 0 +; CHECK: %[[AFN_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[AFN_25_RCP1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_251]]) +; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_25_INS0]], float %[[AFN_25_RCP1]], i64 1 +; CHECK: store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out ; CHECK: %[[FAST_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 ; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]]) @@ -210,14 +188,14 @@ %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1 store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out - %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x - store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + %afn.no.md = fdiv afn <2 x float> <float 1.0, float 1.0>, %x + store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out - %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0 - store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + %afn.25ulp = fdiv afn <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0 - store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out + store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0 store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out @@ -234,13 +212,14 @@ ; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 
1 ; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out -; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] -; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 -; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 2.000000e+00, %[[ARCP_NO1]] -; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 -; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out +; CHECK: %[[AFN_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[AFN_NO_FDIV0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO0]]) +; CHECK: %[[AFN_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_NO_FDIV0]], i64 0 +; CHECK: %[[AFN_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[AFN_NO_FDIV1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO1]]) +; CHECK: %[[AFN_NO_MUL1:[0-9]+]] = fmul afn float 2.000000e+00, %[[AFN_NO_FDIV1]] +; CHECK: %afn.no.md = insertelement <2 x float> %[[AFN_NO_INS0]], float %[[AFN_NO_MUL1]], i64 1 +; CHECK: store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out ; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 ; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]]) @@ -251,14 +230,14 @@ ; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_MUL1]], i64 1 ; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out -; CHECK: %[[ARCP_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]]) -; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0 -; CHECK: 
%[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: %[[ARCP_25_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]]) -; CHECK: %[[ARCP_25_MUL1:[0-9]+]] = fmul arcp float 2.000000e+00, %[[ARCP_25_RCP1]] -; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_MUL1]], i64 1 -; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out +; CHECK: %[[AFN_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[AFN_25_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_250]]) +; CHECK: %[[AFN_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_25_RCP0]], i64 0 +; CHECK: %[[AFN_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[AFN_25_RCP1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_251]]) +; CHECK: %[[AFN_25_MUL1:[0-9]+]] = fmul afn float 2.000000e+00, %[[AFN_25_RCP1]] +; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_25_INS0]], float %[[AFN_25_MUL1]], i64 1 +; CHECK: store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out ; CHECK: %[[FAST_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 ; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]]) @@ -272,14 +251,14 @@ %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out - %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x - store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + %afn.no.md = fdiv afn <2 x float> <float 1.0, float 2.0>, %x + store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out - %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0 - store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + %afn.25ulp = fdiv afn <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0 + store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out %fast.25ulp = fdiv 
fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0 store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out @@ -288,34 +267,34 @@ } ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant( -; CHECK: %[[ARCP_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0 -; CHECK: %[[ARCP_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0 -; CHECK: %[[ARCP_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B0]]) -; CHECK: %[[ARCP_MUL0:[0-9]+]] = fmul arcp float %[[ARCP_A0]], %[[ARCP_RCP0]], !fpmath !0 -; CHECK: %[[ARCP_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_MUL0]], i64 0 -; CHECK: %[[ARCP_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1 -; CHECK: %[[ARCP_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1 -; CHECK: %[[ARCP_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B1]]) -; CHECK: %[[ARCP_MUL1:[0-9]+]] = fmul arcp float %[[ARCP_A1]], %[[ARCP_RCP1]], !fpmath !0 -; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_INS0]], float %[[ARCP_MUL1]], i64 1 -; CHECK: store volatile <2 x float> %arcp.25ulp +; CHECK: %[[AFN_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0 +; CHECK: %[[AFN_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0 +; CHECK: %[[AFN_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_B0]]) +; CHECK: %[[AFN_MUL0:[0-9]+]] = fmul afn float %[[AFN_A0]], %[[AFN_RCP0]] +; CHECK: %[[AFN_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_MUL0]], i64 0 +; CHECK: %[[AFN_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1 +; CHECK: %[[AFN_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1 +; CHECK: %[[AFN_RCP1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_B1]]) +; CHECK: %[[AFN_MUL1:[0-9]+]] = fmul afn float %[[AFN_A1]], %[[AFN_RCP1]] +; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_INS0]], float %[[AFN_MUL1]], i64 1 +; CHECK: store volatile <2 x float> %afn.25ulp ; CHECK: %[[FAST_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0 ; 
CHECK: %[[FAST_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0 ; CHECK: %[[FAST_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B0]]) -; CHECK: %[[FAST_MUL0:[0-9]+]] = fmul fast float %[[FAST_A0]], %[[FAST_RCP0]], !fpmath !0 +; CHECK: %[[FAST_MUL0:[0-9]+]] = fmul fast float %[[FAST_A0]], %[[FAST_RCP0]] ; CHECK: %[[FAST_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_MUL0]], i64 0 ; CHECK: %[[FAST_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1 ; CHECK: %[[FAST_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1 ; CHECK: %[[FAST_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B1]]) -; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]], !fpmath !0 +; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]] ; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_INS0]], float %[[FAST_MUL1]], i64 1 ; CHECK: store volatile <2 x float> %fast.25ulp define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 - %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 - store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + %afn.25ulp = fdiv afn <2 x float> %x.insert, %y, !fpmath !0 + store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out @@ -325,13 +304,14 @@ ; CHECK-LABEL: @fdiv_fpmath_f32_denormals( ; CHECK: %no.md = fdiv float %a, %b{{$}} -; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 -; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 -; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0 -; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 +; CHECK: %md.half.ulp = fdiv float %a, %b +; CHECK: %md.1ulp = fdiv float %a, %b +; CHECK: %md.25ulp = fdiv float %a, %b 
+; CHECK: %md.3ulp = fdiv float %a, %b ; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b) -; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0 -; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 +; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]] +; CHECK: %[[RCP_AFN:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: %afn.md.25ulp = fmul afn float %a, %[[RCP_AFN]] define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out @@ -351,8 +331,8 @@ %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 store volatile float %fast.md.25ulp, float addrspace(1)* %out - %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 - store volatile float %arcp.md.25ulp, float addrspace(1)* %out + %afn.md.25ulp = fdiv afn float %a, %b, !fpmath !0 + store volatile float %afn.md.25ulp, float addrspace(1)* %out ret void } @@ -361,11 +341,6 @@ attributes #1 = { nounwind } attributes #2 = { nounwind "target-features"="+fp32-denormals" } -; CHECK: !0 = !{float 2.500000e+00} -; CHECK: !1 = !{float 5.000000e-01} -; CHECK: !2 = !{float 1.000000e+00} -; CHECK: !3 = !{float 3.000000e+00} - !0 = !{float 2.500000e+00} !1 = !{float 5.000000e-01} !2 = !{float 1.000000e+00} diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -63,7 +63,7 @@ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b - %r.val = fdiv half 1.0, %b.val + %r.val = fdiv half 1.0, %b.val, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -82,25 +82,46 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load 
volatile half, half addrspace(1)* %gep.b %b.abs = call half @llvm.fabs.f16(half %b.val) - %r.val = fdiv half 1.0, %b.abs + %r.val = fdiv half 1.0, %b.abs, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } -; GCN-LABEL: {{^}}v_rcp_f16_arcp: +; We could not do 1/b -> rcp_f16(b) under !fpmath < 1ulp. + +; GCN-LABEL: {{^}}reciprocal_f16_rounded: +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} +; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]] +; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]] +; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]] +; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0 +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext + %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext + %b.val = load volatile half, half addrspace(1)* %gep.b + %r.val = fdiv half 1.0, %b.val + store half %r.val, half addrspace(1)* %gep.r + ret void +} + +; GCN-LABEL: {{^}}v_rcp_f16_afn: ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] ; GFX8_9_10-NOT: [[VAL]] ; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GFX8_9_10-NOT: [[RESULT]] ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16_afn(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load 
volatile half, half addrspace(1)* %gep.b - %r.val = fdiv arcp half 1.0, %b.val + %r.val = fdiv afn half 1.0, %b.val, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -118,7 +139,7 @@ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b - %r.val = fdiv half -1.0, %b.val + %r.val = fdiv half -1.0, %b.val, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -137,7 +158,7 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b %b.sqrt = call half @llvm.sqrt.f16(half %b.val) - %r.val = fdiv half 1.0, %b.sqrt + %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -157,12 +178,12 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b %b.sqrt = call half @llvm.sqrt.f16(half %b.val) - %r.val = fdiv half -1.0, %b.sqrt + %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } -; GCN-LABEL: {{^}}v_fdiv_f16_arcp: +; GCN-LABEL: {{^}}v_fdiv_f16_afn: ; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] ; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] @@ -170,7 +191,7 @@ ; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_fdiv_f16_afn(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -179,7 +200,7 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %a.val = load volatile half, 
half addrspace(1)* %gep.a %b.val = load volatile half, half addrspace(1)* %gep.b - %r.val = fdiv arcp half %a.val, %b.val + %r.val = fdiv afn half %a.val, %b.val store half %r.val, half addrspace(1)* %gep.r ret void } @@ -206,38 +227,38 @@ ret void } -; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16: +; FUNC-LABEL: {{^}}div_afn_2_x_pat_f16: ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} ; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} ; GFX8_9_10: buffer_store_short [[MUL]] -define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_afn_2_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef - %rcp = fdiv arcp half %x, 2.0 + %rcp = fdiv afn half %x, 2.0 store half %rcp, half addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16: -; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}} +; FUNC-LABEL: {{^}}div_afn_k_x_pat_f16: +; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}} ; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}} ; GFX8_9_10: buffer_store_short [[MUL]] -define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_afn_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef - %rcp = fdiv arcp half %x, 10.0 + %rcp = fdiv afn half %x, 10.0 store half %rcp, half addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16: -; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}} +; FUNC-LABEL: {{^}}div_afn_neg_k_x_pat_f16: +; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}} ; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}} ; GFX8_9_10: buffer_store_short [[MUL]] -define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef - %rcp = fdiv arcp half %x, -10.0 + %rcp = fdiv 
afn half %x, -10.0 store half %rcp, half addrspace(1)* %out, align 4 ret void } @@ -249,3 +270,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind "unsafe-fp-math"="true" } + +!0 = !{float 2.500000e+00}