Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1856,4 +1856,10 @@
   [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+// Emit a correctly rounded fp32 divide.
+def int_amdgcn_fdiv_rounded : Intrinsic<
+  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
 }
Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -70,6 +70,7 @@
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
+  bool HasCorrectlyRoundedDivideSqrt = false;
   bool HasFP32Denormals = false;

   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
@@ -542,34 +543,45 @@
 // Insert an intrinsic for fast fdiv for safe math situations where we can
 // reduce precision. Leave fdiv for situations where the generic node is
 // expected to be optimized.
+//
+// Also insert an intrinsic for correctly rounded fdiv when
+// -cl-fp32-correctly-rounded-divide-sqrt is enabled.
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
   Type *Ty = FDiv.getType();

   if (!Ty->getScalarType()->isFloatTy())
     return false;

-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  if (!FPMath)
-    return false;
-
-  const FPMathOperator *FPOp = cast<FPMathOperator>(&FDiv);
-  float ULP = FPOp->getFPAccuracy();
-  if (ULP < 2.5f)
-    return false;
-
+  const FPMathOperator *FPOp = cast<FPMathOperator>(&FDiv);
   FastMathFlags FMF = FPOp->getFastMathFlags();
   bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal();
-
   // With UnsafeDiv node will be optimized to just rcp and mul.
   if (UnsafeDiv)
     return false;

+  bool SafeFast = true;
+  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+  if (!FPMath) {
+    // Insert an intrinsic when -cl-fp32-correctly-rounded-divide-sqrt
+    // is enabled.
+    if (!HasCorrectlyRoundedDivideSqrt)
+      return false;
+    SafeFast = false;
+  } else {
+    float ULP = FPOp->getFPAccuracy();
+    if (ULP < 2.5f)
+      return false;
+  }
+
   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
   Builder.setFastMathFlags(FMF);
   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

-  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+  Intrinsic::ID IntrinsicOpc = SafeFast ? Intrinsic::amdgcn_fdiv_fast
+                                        : Intrinsic::amdgcn_fdiv_rounded;
+
+  Function *Decl = Intrinsic::getDeclaration(Mod, IntrinsicOpc);

   Value *Num = FDiv.getOperand(0);
   Value *Den = FDiv.getOperand(1);
@@ -586,7 +598,7 @@
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
       Value *NewElt;

-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
+      if (SafeFast && shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
       } else {
         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -595,7 +607,7 @@
       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
   } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
+    if (!SafeFast || !shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
   }
@@ -613,6 +625,11 @@
   return Attr.getValueAsString() == "true";
 }

+static bool hasCorrectlyRoundedDivideSqrt(const Function &F) {
+  Attribute Attr = F.getFnAttribute("correctly-rounded-divide-sqrt-fp-math");
+  return Attr.getValueAsString() == "true";
+}
+
 static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder,
                                             Value *LHS, Value *RHS) {
   Type *I32Ty = Builder.getInt32Ty();
@@ -1034,6 +1051,7 @@
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   DA = &getAnalysis<LegacyDivergenceAnalysis>();
   HasUnsafeFPMath = hasUnsafeFPMath(F);
+  HasCorrectlyRoundedDivideSqrt = hasCorrectlyRoundedDivideSqrt(F);
   HasFP32Denormals = ST->hasFP32Denormals(F);

   bool MadeChange = false;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -80,6 +80,8 @@
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFDIV_ROUNDED(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFDIV_SAFE(SDValue LHS, SDValue RHS, SelectionDAG &DAG) const;
   SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5910,6 +5910,8 @@
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
+  case Intrinsic::amdgcn_fdiv_rounded:
+    return lowerFDIV_ROUNDED(Op, DAG);
   case Intrinsic::amdgcn_interp_p1_f16: {
     SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
                                     Op.getOperand(5), SDValue());
@@ -7720,13 +7722,9 @@
   return DAG.getTargetConstant(Mode, SL, MVT::i32);
 }

-SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
-  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
-    return FastLowered;
-
-  SDLoc SL(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
+SDValue SITargetLowering::lowerFDIV_SAFE(SDValue LHS, SDValue RHS,
+                                         SelectionDAG &DAG) const {
+  SDLoc SL(RHS);

   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
@@ -7824,6 +7822,21 @@
   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
 }

+SDValue SITargetLowering::lowerFDIV_ROUNDED(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  return lowerFDIV_SAFE(LHS, RHS, DAG);
+}
+
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+    return FastLowered;
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  return lowerFDIV_SAFE(LHS, RHS, DAG);
+}
+
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
   if (DAG.getTarget().Options.UnsafeFPMath)
     return lowerFastUnsafeFDIV(Op, DAG);
Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -213,9 +213,36 @@
   ret void
 }

+; CHECK-LABEL: @fdiv_rounded(
+; CHECK: %no.md = call float @llvm.amdgcn.fdiv.rounded(float %a, float %b)
+define amdgpu_kernel void @fdiv_rounded(float addrspace(1)* %out, float %a, float %b) #3 {
+  %no.md = fdiv float %a, %b
+  store volatile float %no.md, float addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @fdiv_rounded_vector(
+; CHECK: %[[Anomd0:[0-9]+]] = extractelement <2 x float> %a, i64 0
+; CHECK: %[[Bnomd0:[0-9]+]] = extractelement <2 x float> %b, i64 0
+; CHECK: %[[FDIVnomd0:[0-9]+]] = call float @llvm.amdgcn.fdiv.rounded(float %[[Anomd0]], float %[[Bnomd0]])
+; CHECK: %[[INSnomd0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIVnomd0]], i64 0
+; CHECK: %[[Anomd1:[0-9]+]] = extractelement <2 x float> %a, i64 1
+; CHECK: %[[Bnomd1:[0-9]+]] = extractelement <2 x float> %b, i64 1
+; CHECK: %[[FDIVnomd1:[0-9]+]] = call float @llvm.amdgcn.fdiv.rounded(float %[[Anomd1]], float %[[Bnomd1]])
+; CHECK: %no.md = insertelement <2 x float> %[[INSnomd0]], float %[[FDIVnomd1]], i64 1
+define amdgpu_kernel void @fdiv_rounded_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #3 {
+  %no.md = fdiv <2 x float> %a, %b
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
 attributes #0 = { nounwind optnone noinline }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind "target-features"="+fp32-denormals" }
+attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" }

 ; CHECK: !0 = !{float 2.500000e+00}
 ; CHECK: !1 = !{float 5.000000e-01}
Index: llvm/test/CodeGen/AMDGPU/fdiv.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -73,6 +73,68 @@
   ret void
 }

+; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt:
+
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX10: s_denorm_mode 15
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX10: s_denorm_mode 12
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #3 {
+entry:
+  %fdiv = fdiv float 1.000000e+00, %a
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt:
+
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; PREGFX10-NOT: s_setreg
+; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
+; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; PREGFX10-NOT: s_setreg

+; GFX10-NOT: s_denorm_mode
+; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
+; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
+; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
+; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
+; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
+; GFX10-NOT: s_denorm_mode
+
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #4 {
+entry:
+  %fdiv = fdiv float 1.000000e+00, %a
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
 ; GCN: v_cndmask_b32
 ; GCN: v_mul_f32
@@ -287,5 +349,8 @@
 attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,+fp64-fp16-denormals,-flat-for-global" }
 attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
 attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }
+attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
+attributes #4 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" "target-features"="+fp32-denormals,-flat-for-global" }

 !0 = !{float 2.500000e+00}
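For context (not part of the patch): a minimal end-to-end sketch of what the change does. The kernel name @example and the file name example.ll are illustrative; the attribute, intrinsic, and pass names come from the patch and the tests above, and the opt invocation mirrors the usual RUN line style of amdgpu-codegenprepare-fdiv.ll.

; example.ll -- illustrative input. With the attribute set and no !fpmath
; metadata, AMDGPUCodeGenPrepare is expected to rewrite the fdiv below to
;   %q = call float @llvm.amdgcn.fdiv.rounded(float %a, float %b)
; which SIISelLowering then expands to the correctly rounded
; v_div_scale/v_div_fmas/v_div_fixup sequence checked in fdiv.ll.
; Try: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare example.ll
define amdgpu_kernel void @example(float addrspace(1)* %out, float %a, float %b) #0 {
  %q = fdiv float %a, %b
  store float %q, float addrspace(1)* %out
  ret void
}
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" }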