Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -606,24 +606,22 @@ return true; } -// Perform RCP optimizations: +// lowerUsingRcp: // -// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with -// denormals flushed. +// 1/x -> rcp(x) when fdiv is allowed to be re-associated or rcp is accurate. // -// a/b -> a*rcp(b) when fast unsafe rcp is legal. -static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal, - IRBuilder<> Builder, MDNode *FPMath, Module *Mod, - bool HasDenormals, bool NeedHighAccuracy) { +// a/b -> a*rcp(b) when fdiv is allowed to be re-associated. +static Value *lowerUsingRcp(Value *Num, Value *Den, bool CanReassociateFDiv, + bool RcpIsAccurate, IRBuilder<> Builder, + MDNode *FPMath, Module *Mod) { - Type *Ty = Den->getType(); - if (!FastUnsafeRcpLegal && Ty->isFloatTy() && - (HasDenormals || NeedHighAccuracy)) + if (!CanReassociateFDiv && !RcpIsAccurate) return nullptr; + Type *Ty = Den->getType(); Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty); if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) { - if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) { + if (CanReassociateFDiv || RcpIsAccurate) { if (CLHS->isExactlyValue(1.0)) { // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. @@ -648,7 +646,7 @@ } } - if (FastUnsafeRcpLegal) { + if (CanReassociateFDiv) { // Turn into multiply by the reciprocal. 
// x / y -> x * (1.0 / y) Value *Recip = Builder.CreateCall(Decl, { Den }); @@ -657,40 +655,54 @@ return nullptr; } -static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal, - bool HasDenormals) { - const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); - if (!CNum) - return HasDenormals; +// lowerUsingFDivFast: +// +// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. +// +// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. +// +// NOTE: lowerUsingRcp should be tried first because rcp is the preference. +static Value *lowerUsingFDivFast(Value *Num, Value *Den, float ReqdAccuracy, + bool HasDenormals, IRBuilder<> Builder, + MDNode *FPMath, Module *Mod) { + // fdiv.fast can achieve 2.5 ULP accuracy. + if (ReqdAccuracy < 2.5f) + return nullptr; - if (FastUnsafeRcpLegal) - return true; + // Only have fdiv.fast for f32. + Type *Ty = Den->getType(); + if (!Ty->isFloatTy()) + return nullptr; - bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0); + bool NumIsOne = false; + if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) { + if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0)) + NumIsOne = true; + } - // Reciprocal f32 is handled separately without denormals. - return HasDenormals ^ IsOne; -} + // fdiv.fast does not support denormals, but 1.0/x is always fine to use. + if (HasDenormals && !NumIsOne) + return nullptr; + Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); + return Builder.CreateCall(Decl, { Num, Den }, "", FPMath); +} // Optimizations is performed based on fpmath, fast math flags as wells as // denormals to lower fdiv using either rcp or fdiv.fast. // -// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on -// unsafe-fp-math, fast math flags, denormals and fpmath -// accuracy request. +// Use rcp: +// 1/x -> rcp(x) when fdiv is allowed to be re-associated or rcp is +// sufficiently accurate. 
// -// RCP Optimizations: -// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with -// denormals flushed. -// a/b -> a*rcp(b) when fast unsafe rcp is legal. +// a/b -> a*rcp(b) when fdiv is allowed to be re-associated. // // Use fdiv.fast: -// a/b -> fdiv.fast(a, b) when RCP optimization is not performed and -// fpmath >= 2.5ULP with denormals flushed. +// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. +// +// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. // -// 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and -// fpmath >= 2.5ULP with denormals. +// Using rcp is the preference. bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Type *Ty = FDiv.getType()->getScalarType(); @@ -701,18 +713,16 @@ const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); - const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f; + const float ReqdAccuracy = FPOp->getFPAccuracy(); FastMathFlags FMF = FPOp->getFastMathFlags(); - // Determine whether it is ok to use rcp based on unsafe-fp-math, - // fast math flags, denormals and accuracy request. - const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() || - (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy) || FMF.approxFunc())); + const bool CanReassociateFDiv = HasUnsafeFPMath || FMF.allowReciprocal(); - // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used. - const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy && - !FastUnsafeRcpLegal; + // rcp_f16 is accurate for !fpmath >= 1.0ulp. + // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. + // rcp_f64 is never accurate. 
+ const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) || + (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f); IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); Builder.setFastMathFlags(FMF); @@ -730,31 +740,24 @@ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { Value *NumEltI = Builder.CreateExtractElement(Num, I); Value *DenEltI = Builder.CreateExtractElement(Den, I); - Value *NewElt = nullptr; - if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal, - HasFP32Denormals)) { - Function *Decl = - Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath); - } - if (!NewElt) // Try rcp. - NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder, - FPMath, Mod, HasFP32Denormals, NeedHighAccuracy); - if (!NewElt) + // Try rcp first. + Value *NewElt = lowerUsingRcp(NumEltI, DenEltI, CanReassociateFDiv, + RcpIsAccurate, Builder, FPMath, Mod); + if (!NewElt) // Try fdiv.fast. + NewElt = lowerUsingFDivFast(NumEltI, DenEltI, ReqdAccuracy, + HasFP32Denormals, Builder, FPMath, Mod); + if (!NewElt) // Keep the original. NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath); NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } - } else { // Scalar. - if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal, - HasFP32Denormals)) { - Function *Decl = - Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath); - } - if (!NewFDiv) { // Try rcp. - NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath, - Mod, HasFP32Denormals, NeedHighAccuracy); + } else { // Scalar FDiv. + // Try rcp first. + NewFDiv = lowerUsingRcp(Num, Den, CanReassociateFDiv, RcpIsAccurate, + Builder, FPMath, Mod); + if (!NewFDiv) { // Try fdiv.fast. 
+ NewFDiv = lowerUsingFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals, + Builder, FPMath, Mod); } } Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7470,19 +7470,12 @@ EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath || - (Flags.hasAllowReciprocal() && - ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) || - VT == MVT::f16 || - Flags.hasApproximateFuncs())); - - // Do rcp optimization only when fast unsafe rcp is legal here. - // NOTE: We already performed RCP optimization to insert intrinsics in - // AMDGPUCodeGenPrepare. Ideally there should have no opportunity here to - // rcp optimization. - // However, there are cases like FREM, which is expended into a sequence - // of instructions including FDIV, which may expose new opportunities. - if (!FastUnsafeRcpLegal) + bool CanReassociateFDiv = DAG.getTarget().Options.UnsafeFPMath || + Flags.hasAllowReciprocal(); + + // Without !fpmath accuracy information, we can't do more because we don't + // know exactly whether rcp is accurate enough to meet !fpmath requirement. 
+ if (!CanReassociateFDiv) return SDValue(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -49,7 +49,7 @@ ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} ; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 -; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x +; CHECK: %arcp.no.md = call arcp float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x) @@ -78,28 +78,6 @@ ret void } -; CHECK-LABEL: @rcp_fdiv_arcp_denormal( -; CHECK: %arcp.low.accuracy = call arcp float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float %x), !fpmath !0 -; CHECK: %arcp.high.accuracy = fdiv arcp float 1.000000e+00, %x, !fpmath !2 -; CHECK: %arcp.low.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) -; CHECK: %arcp.high.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) -define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, float %x) #2 { - - %arcp.low.accuracy = fdiv arcp float 1.0, %x, !fpmath !0 - store volatile float %arcp.low.accuracy, float addrspace(1)* %out - - %arcp.high.accuracy = fdiv arcp float 1.0, %x, !fpmath !2 - store volatile float %arcp.high.accuracy, float addrspace(1)* %out - - %arcp.low.afn = fdiv arcp afn float 1.0, %x, !fpmath !0 - store volatile float %arcp.low.afn, float addrspace(1)* %out - - %arcp.high.afn = fdiv arcp afn float 1.0, %x, !fpmath !2 - store volatile float %arcp.high.afn, float addrspace(1)* %out - - ret void -} - ; CHECK-LABEL: @fdiv_fpmath_vector( ; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x 
float> %a, i64 0 ; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 @@ -173,10 +151,10 @@ ; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out ; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] +; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO0]]) ; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 ; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO1]] +; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO1]]) ; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 ; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out @@ -235,11 +213,12 @@ ; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out ; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] +; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO0]]) ; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 ; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 2.000000e+00, %[[ARCP_NO1]] -; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 +; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO1]]) +; CHECK: %[[ARCP_NO_MUL1:[0-9]+]] = fmul arcp float 2.000000e+00, %[[ARCP_NO_FDIV1]] +; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_MUL1]], i64 1 ; CHECK: store volatile <2 x float> %arcp.no.md, <2 x 
float> addrspace(1)* %out ; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 @@ -331,7 +310,8 @@ ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 ; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b) ; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0 -; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 +; CHECK: %[[RCP_ARCP:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: %arcp.md.25ulp = fmul arcp float %a, %[[RCP_ARCP]], !fpmath !0 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out Index: llvm/test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -63,7 +63,7 @@ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b - %r.val = fdiv half 1.0, %b.val + %r.val = fdiv half 1.0, %b.val, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -82,7 +82,28 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b %b.abs = call half @llvm.fabs.f16(half %b.val) - %r.val = fdiv half 1.0, %b.abs + %r.val = fdiv half 1.0, %b.abs, !fpmath !0 + store half %r.val, half addrspace(1)* %gep.r + ret void +} + +; We could not do 1/b -> rcp_f16(b) under !fpmath < 1ulp. 
+ +; GCN-LABEL: {{^}}reciprocal_f16_rounded: +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} +; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]] +; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]] +; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]] +; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0 +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext + %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext + %b.val = load volatile half, half addrspace(1)* %gep.b + %r.val = fdiv half 1.0, %b.val store half %r.val, half addrspace(1)* %gep.r ret void } @@ -100,7 +121,7 @@ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b - %r.val = fdiv arcp half 1.0, %b.val + %r.val = fdiv arcp half 1.0, %b.val, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -118,7 +139,7 @@ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b - %r.val = fdiv half -1.0, %b.val + %r.val = fdiv half -1.0, %b.val, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -137,7 +158,7 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b %b.sqrt = call half @llvm.sqrt.f16(half %b.val) - %r.val = fdiv half 1.0, %b.sqrt + %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0 store half %r.val, half 
addrspace(1)* %gep.r ret void } @@ -157,7 +178,7 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b %b.sqrt = call half @llvm.sqrt.f16(half %b.val) - %r.val = fdiv half -1.0, %b.sqrt + %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -249,3 +270,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind "unsafe-fp-math"="true" } + +!0 = !{float 2.500000e+00}