Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -525,12 +525,64 @@
   return true;
 }
 
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
+// Perform RCP optimizations:
+//
+// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
+//        denormals flushed.
+//
+// a/b -> a*rcp(b) when fast unsafe rcp is legal.
+static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
+                            IRBuilder<> Builder, MDNode *FPMath, Module *Mod,
+                            bool HasDenormals, bool NeedHighAccuracy) {
+
+  Type *Ty = Den->getType();
+  if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
+      (HasDenormals || NeedHighAccuracy))
+    return nullptr;
+
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
+  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
+    if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) {
+      if (CLHS->isExactlyValue(1.0)) {
+        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+        // the CI documentation has a worst case error of 1 ulp.
+        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+        // use it as long as we aren't trying to use denormals.
+        //
+        // v_rcp_f16 and v_rsq_f16 DO support denormals.
+
+        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
+        // insert rsq intrinsic here.
+
+        // 1.0 / x -> rcp(x)
+        return Builder.CreateCall(Decl, { Den });
+      }
+
+      // Same as for 1.0, but expand the sign out of the constant.
+      if (CLHS->isExactlyValue(-1.0)) {
+        // -1.0 / x -> rcp (fneg x)
+        Value *FNeg = Builder.CreateFNeg(Den);
+        return Builder.CreateCall(Decl, { FNeg });
+      }
+    }
+  }
+
+  if (FastUnsafeRcpLegal) {
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    Value *Recip = Builder.CreateCall(Decl, { Den });
+    return Builder.CreateFMul(Num, Recip, "", FPMath);
+  }
+  return nullptr;
+}
+
+static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,
+                              bool HasDenormals) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
     return HasDenormals;
 
-  if (UnsafeDiv)
+  if (FastUnsafeRcpLegal)
     return true;
 
   bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
@@ -539,44 +591,57 @@
   return HasDenormals ^ IsOne;
 }
 
-// Insert an intrinsic for fast fdiv for safe math situations where we can
-// reduce precision. Leave fdiv for situations where the generic node is
-// expected to be optimized.
+
+// Optimization is performed based on fpmath, fast math flags, as well as
+// denormals, to lower fdiv using either rcp or fdiv.fast.
+//
+// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
+//                     unsafe-fp-math, fast math flags, denormals and fpmath
+//                     accuracy request.
+//
+// RCP Optimizations:
+//   1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
+//          denormals flushed.
+//   a/b -> a*rcp(b) when fast unsafe rcp is legal.
+//
+// Use fdiv.fast:
+//   a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
+//          fpmath >= 2.5ULP with denormals flushed.
+//
+//   1/x -> fdiv.fast(1,x) when RCP optimization is not performed and
+//          fpmath >= 2.5ULP with denormals.
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
-  Type *Ty = FDiv.getType();
-  if (!Ty->getScalarType()->isFloatTy())
-    return false;
+  Type *Ty = FDiv.getType()->getScalarType();
 
-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  if (!FPMath)
+  // No intrinsic for fdiv16 if target does not support f16.
+  if (Ty->isHalfTy() && !ST->has16BitInsts())
     return false;
 
   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
-  float ULP = FPOp->getFPAccuracy();
-  if (ULP < 2.5f)
-    return false;
+  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+  const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f;
 
   FastMathFlags FMF = FPOp->getFastMathFlags();
-  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
-                   FMF.allowReciprocal();
+  // Determine whether it is ok to use rcp based on unsafe-fp-math,
+  // fast math flags, denormals and accuracy request.
+  const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() ||
+      (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
+                                 || FMF.approxFunc()));
 
-  // With UnsafeDiv node will be optimized to just rcp and mul.
-  if (UnsafeDiv)
-    return false;
+  // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used.
+  const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&
+                           !FastUnsafeRcpLegal;
 
-  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
   Builder.setFastMathFlags(FMF);
   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
 
-  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-
   Value *Num = FDiv.getOperand(0);
   Value *Den = FDiv.getOperand(1);
 
   Value *NewFDiv = nullptr;
-
-  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+  if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType())) {
     NewFDiv = UndefValue::get(VT);
 
     // FIXME: Doesn't do the right thing for cases where the vector is partially
@@ -584,19 +649,32 @@
     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
       Value *NumEltI = Builder.CreateExtractElement(Num, I);
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
-      Value *NewElt;
-
-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
-        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
-      } else {
-        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+      Value *NewElt = nullptr;
+      if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,
+                                            HasFP32Denormals)) {
+        Function *Decl =
+            Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);
       }
+      if (!NewElt) // Try rcp.
+        NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
+                               FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
+      if (!NewElt)
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);
 
       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
-  } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
-      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+  } else { // Scalar.
+    if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,
+                                          HasFP32Denormals)) {
+      Function *Decl =
+          Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+      NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
+    }
+    if (!NewFDiv) { // Try rcp.
+      NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
+                              Mod, HasFP32Denormals, NeedHighAccuracy);
+    }
   }
 
   if (NewFDiv) {
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7537,49 +7537,54 @@
   SDValue RHS = Op.getOperand(1);
   EVT VT = Op.getValueType();
   const SDNodeFlags Flags = Op->getFlags();
-  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
-  if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
+  bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath ||
+      (Flags.hasAllowReciprocal() &&
+       ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) ||
+        VT == MVT::f16 ||
+        Flags.hasApproximateFuncs()));
+
+  // Do rcp optimization only when fast unsafe rcp is legal here.
+  // NOTE: We already performed RCP optimization to insert intrinsics in
+  // AMDGPUCodeGenPrepare. Ideally there should be no opportunity here for
+  // rcp optimization.
+  // However, there are cases like FREM, which is expanded into a sequence
+  // of instructions including FDIV, which may expose new opportunities.
+  if (!FastUnsafeRcpLegal)
     return SDValue();
 
   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
-    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
-      if (CLHS->isExactlyValue(1.0)) {
-        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
-        // the CI documentation has a worst case error of 1 ulp.
-        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
-        // use it as long as we aren't trying to use denormals.
-        //
-        // v_rcp_f16 and v_rsq_f16 DO support denormals.
-
-        // 1.0 / sqrt(x) -> rsq(x)
-
-        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
-        // error seems really high at 2^29 ULP.
-        if (RHS.getOpcode() == ISD::FSQRT)
-          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
-
-        // 1.0 / x -> rcp(x)
-        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
-      }
+    if (CLHS->isExactlyValue(1.0)) {
+      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+      // the CI documentation has a worst case error of 1 ulp.
+      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+      // use it as long as we aren't trying to use denormals.
+      //
+      // v_rcp_f16 and v_rsq_f16 DO support denormals.
 
-      // Same as for 1.0, but expand the sign out of the constant.
-      if (CLHS->isExactlyValue(-1.0)) {
-        // -1.0 / x -> rcp (fneg x)
-        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
-        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
-      }
+      // 1.0 / sqrt(x) -> rsq(x)
+
+      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+      // error seems really high at 2^29 ULP.
+      if (RHS.getOpcode() == ISD::FSQRT)
+        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+      // 1.0 / x -> rcp(x)
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
     }
-  }
-  if (Unsafe) {
-    // Turn into multiply by the reciprocal.
-    // x / y -> x * (1.0 / y)
-    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
-    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
+    // Same as for 1.0, but expand the sign out of the constant.
+    if (CLHS->isExactlyValue(-1.0)) {
+      // -1.0 / x -> rcp (fneg x)
+      SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
+    }
   }
-  return SDValue();
+  // Turn into multiply by the reciprocal.
+ // x / y -> x * (1.0 / y) + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); } static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, @@ -8725,6 +8730,11 @@ N->getFlags()); } + if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) { + return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, + N0.getOperand(0), N->getFlags()); + } + return AMDGPUTargetLowering::performRcpCombine(N, DCI); } Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -16,8 +16,10 @@ ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 ; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 -; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 -; CHECK: arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 +; CHECK: %[[FAST_RCP:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]], !fpmath !0 +; CHECK: %[[ARCP_RCP:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: arcp.md.25ulp = fmul arcp float %a, %[[ARCP_RCP]], !fpmath !0 define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out @@ -45,12 +47,12 @@ ; CHECK-LABEL: @rcp_fdiv_fpmath( ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} -; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0 +; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 -; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}} -; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 -; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}} -; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0 +; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x +; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x) +; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x) +; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x) define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { %no.md = fdiv float 1.0, %x store volatile float %no.md, float addrspace(1)* %out @@ -76,10 +78,58 @@ ret void } +; CHECK-LABEL: @rcp_fdiv_arcp_denormal( +; CHECK: %arcp.low.accuracy = call arcp float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float %x), !fpmath !0 +; CHECK: %arcp.high.accuracy = fdiv arcp float 1.000000e+00, %x, !fpmath !2 +; CHECK: %arcp.low.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) +; CHECK: %arcp.high.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) +define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, float %x) #2 { + + %arcp.low.accuracy = fdiv arcp float 1.0, %x, !fpmath !0 + store volatile float %arcp.low.accuracy, float addrspace(1)* %out + + %arcp.high.accuracy = fdiv arcp float 1.0, %x, !fpmath !2 + store volatile float %arcp.high.accuracy, float addrspace(1)* %out + + %arcp.low.afn = fdiv arcp afn float 1.0, %x, !fpmath !0 + store volatile float %arcp.low.afn, float addrspace(1)* %out + + %arcp.high.afn = fdiv arcp afn float 1.0, %x, !fpmath !2 + store volatile 
float %arcp.high.afn, float addrspace(1)* %out + + ret void +} + ; CHECK-LABEL: @fdiv_fpmath_vector( -; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}} -; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1 -; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2 +; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 +; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 +; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float %[[NO_A0]], %[[NO_B0]] +; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0 +; CHECK: %[[NO_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 +; CHECK: %[[NO_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 +; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float %[[NO_A1]], %[[NO_B1]] +; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[HALF_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 +; CHECK: %[[HALF_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 +; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float %[[HALF_A0]], %[[HALF_B0]], !fpmath !1 +; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0 +; CHECK: %[[HALF_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 +; CHECK: %[[HALF_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 +; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]], !fpmath !1 +; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out + +; CHECK: %[[ONE_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 +; CHECK: %[[ONE_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 +; CHECK: %[[ONE_FDIV0:[0-9]+]] = fdiv float %[[ONE_A0]], %[[ONE_B0]], !fpmath !2 +; CHECK: %[[ONE_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ONE_FDIV0]], i64 0 +; CHECK: %[[ONE_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 +; CHECK: %[[ONE_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 +; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]], !fpmath !2 +; CHECK: %md.1ulp = insertelement <2 x float> %[[ONE_INS0]], float %[[ONE_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out ; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 ; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 @@ -106,12 +156,52 @@ } ; CHECK-LABEL: @rcp_fdiv_fpmath_vector( -; CHECK: %no.md = fdiv <2 x float> , %x{{$}} -; CHECK: %md.half.ulp = fdiv <2 x float> , %x, !fpmath !1 -; CHECK: %arcp.no.md = fdiv arcp <2 x float> , %x{{$}} -; CHECK: %fast.no.md = fdiv fast <2 x float> , %x{{$}} -; CHECK: %arcp.25ulp = fdiv arcp <2 x float> , %x, !fpmath !0 -; CHECK: %fast.25ulp = fdiv fast <2 x float> , %x, !fpmath !0 +; CHECK: %[[NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[NO0]] +; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0 +; CHECK: %[[NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[NO1]] +; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[HALF0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]], !fpmath !1 +; 
CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0 +; CHECK: %[[HALF1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF1]], !fpmath !1 +; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out + +; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] +; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 +; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO1]] +; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]]) +; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0 +; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[FAST_NO_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]]) +; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_RCP1]], i64 1 +; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[ARCP_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]]) +; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0 +; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[ARCP_25_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]]) +; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_RCP1]], i64 1 +; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + +; CHECK: %[[FAST_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]]) +; CHECK: %[[FAST_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_25_RCP0]], i64 0 +; CHECK: %[[FAST_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[FAST_25_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_251]]) +; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_25_INS0]], float %[[FAST_25_RCP1]], i64 1 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> , %x @@ -136,12 +226,48 @@ } ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat( -; CHECK: %no.md = fdiv <2 x float> , %x -; CHECK: %arcp.no.md = fdiv arcp <2 x float> , %x -; CHECK: %fast.no.md = fdiv fast <2 x float> , %x{{$}} -; CHECK: %arcp.25ulp = fdiv arcp <2 x float> , %x, !fpmath !0 -; CHECK: %fast.25ulp = fdiv fast <2 x float> , %x, !fpmath !0 -; CHECK: store volatile <2 x float> %fast.25ulp +; CHECK: %[[NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[NO0]] +; CHECK: %[[NO_INS0:[0-9]+]] = 
insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0 +; CHECK: %[[NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 2.000000e+00, %[[NO1]] +; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] +; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 +; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 2.000000e+00, %[[ARCP_NO1]] +; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]]) +; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0 +; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[FAST_NO_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]]) +; CHECK: %[[FAST_NO_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_NO_RCP1]] +; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_MUL1]], i64 1 +; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[ARCP_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]]) +; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0 +; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[ARCP_25_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]]) +; CHECK: %[[ARCP_25_MUL1:[0-9]+]] = fmul arcp float 2.000000e+00, %[[ARCP_25_RCP1]] +; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_MUL1]], i64 1 +; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + +; CHECK: %[[FAST_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]]) +; CHECK: %[[FAST_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_25_RCP0]], i64 0 +; CHECK: %[[FAST_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[FAST_25_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_251]]) +; CHECK: %[[FAST_25_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_25_RCP1]] +; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_25_INS0]], float %[[FAST_25_MUL1]], i64 1 +; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> , %x store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -161,12 +287,29 @@ ret void } -; FIXME: Should be able to get fdiv for 1.0 component ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant( -; CHECK: %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 +; CHECK: %[[ARCP_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0 +; CHECK: 
%[[ARCP_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0 +; CHECK: %[[ARCP_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B0]]) +; CHECK: %[[ARCP_MUL0:[0-9]+]] = fmul arcp float %[[ARCP_A0]], %[[ARCP_RCP0]], !fpmath !0 +; CHECK: %[[ARCP_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_MUL0]], i64 0 +; CHECK: %[[ARCP_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1 +; CHECK: %[[ARCP_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1 +; CHECK: %[[ARCP_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B1]]) +; CHECK: %[[ARCP_MUL1:[0-9]+]] = fmul arcp float %[[ARCP_A1]], %[[ARCP_RCP1]], !fpmath !0 +; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_INS0]], float %[[ARCP_MUL1]], i64 1 ; CHECK: store volatile <2 x float> %arcp.25ulp -; CHECK: %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 +; CHECK: %[[FAST_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0 +; CHECK: %[[FAST_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0 +; CHECK: %[[FAST_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B0]]) +; CHECK: %[[FAST_MUL0:[0-9]+]] = fmul fast float %[[FAST_A0]], %[[FAST_RCP0]], !fpmath !0 +; CHECK: %[[FAST_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_MUL0]], i64 0 +; CHECK: %[[FAST_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1 +; CHECK: %[[FAST_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1 +; CHECK: %[[FAST_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B1]]) +; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]], !fpmath !0 +; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_INS0]], float %[[FAST_MUL1]], i64 1 ; CHECK: store volatile <2 x float> %fast.25ulp define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 @@ -186,8 +329,9 @@ ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 ; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 -; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 -; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 +; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0 +; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out Index: llvm/test/CodeGen/AMDGPU/fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -284,6 +284,68 @@ ret void } +; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt: + +; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] +; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] + +; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX10: s_denorm_mode 15 +; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] +; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], 
[[DEN_SCALE]] +; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX10: s_denorm_mode 12 +; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] +; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], + +define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #0 { +entry: + %fdiv = fdiv float 1.000000e+00, %a + store float %fdiv, float addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt: + +; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] + +; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; PREGFX10-NOT: s_setreg +; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] +; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] +; PREGFX10-NOT: s_setreg + +; GFX10-NOT: s_denorm_mode +; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] +; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]] +; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] +; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]] +; GFX10-NOT: s_denorm_mode + +; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] +; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], +define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #2 { +entry: + %fdiv = fdiv float 1.000000e+00, %a + store float %fdiv, float addrspace(1)* %out + ret void +} + + attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,+fp64-fp16-denormals,-flat-for-global" } attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" } attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" } Index: llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -348,7 +348,7 @@ ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 - %div = fdiv fast float 1.000000e+00, %load + %div = fdiv fast float 1.000000e+00, %load, !fpmath !0 store float %div, float addrspace(1)* %arg, align 4 ret void } @@ -359,7 +359,7 @@ ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 - %div = fdiv fast float -1.000000e+00, %load + %div = fdiv fast float -1.000000e+00, %load, !fpmath !0 store float %div, float addrspace(1)* %arg, align 4 ret void } @@ -370,7 +370,7 @@ ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 - %neg = fsub float -0.000000e+00, %load + %neg = fsub float 
-0.000000e+00, %load, !fpmath !0 %div = fdiv fast float 1.000000e+00, %neg store float %div, float addrspace(1)* %arg, align 4 ret void @@ -382,22 +382,18 @@ ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 - %neg = fsub float -0.000000e+00, %load + %neg = fsub float -0.000000e+00, %load, !fpmath !0 %div = fdiv fast float -1.000000e+00, %neg store float %div, float addrspace(1)* %arg, align 4 ret void } ; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded: -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM: v_div_fmas_f32 -; GCN-DENORM: v_div_fixup_f32 - -; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] -; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN-DAG: v_div_scale_f32 +; GCN-DAG: v_rcp_f32_e32 +; GCN-DAG: v_div_scale_f32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %div = fdiv float 1.000000e+00, %load @@ -406,15 +402,11 @@ } ; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded: -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM: v_div_fmas_f32 -; GCN-DENORM: v_div_fixup_f32 - -; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] -; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN-DAG: v_div_scale_f32 +; GCN-DAG: v_rcp_f32_e32 +; GCN-DAG: v_div_scale_f32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %div = fdiv float -1.000000e+00, %load @@ -423,15 +415,11 @@ } ; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded: -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM: v_div_fmas_f32 -; GCN-DENORM: v_div_fixup_f32 - -; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] -; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN-DAG: v_div_scale_f32 +; GCN-DAG: v_rcp_f32_e32 +; GCN-DAG: v_div_scale_f32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %neg = fsub float -0.000000e+00, %load @@ -441,15 +429,11 @@ } ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded: -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM: v_div_fmas_f32 -; GCN-DENORM: v_div_fixup_f32 - -; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] -; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN-DAG: v_div_scale_f32 +; GCN-DAG: v_rcp_f32_e32 +; GCN-DAG: v_div_scale_f32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %neg = fsub float -0.000000e+00, %load Index: llvm/test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== 
--- llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -219,13 +219,30 @@ ; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]], ; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]] ; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]] +define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { +.entry: + %tmp7 = fdiv float 1.000000e+00, %tmp6 + %tmp8 = fmul float 0.000000e+00, %tmp7 + %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8 + %.i188 = fadd float %tmp9, 0.000000e+00 + %tmp10 = fcmp uge float %.i188, %tmp2 + %tmp11 = fsub float -0.000000e+00, %.i188 + %.i092 = select i1 %tmp10, float %tmp2, float %tmp11 + %tmp12 = fcmp ule float %.i092, 0.000000e+00 + %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000 + ret float %.i198 +} + +; This is a workaround because -enable-no-signed-zeros-fp-math does not set up +; function attribute unsafe-fp-math automatically. Combine with the previous test +; when that is done. +; GCN-LABEL: {{^}}fneg_fadd_0_nsz: ; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]], ; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]], ; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], ; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], ; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]] - -define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { +define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 { .entry: %tmp7 = fdiv float 1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 @@ -2524,3 +2541,4 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "unsafe-fp-math"="true" } Index: llvm/test/CodeGen/AMDGPU/known-never-snan.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -11,7 +11,7 @@ ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_med3_f32 v0, |v0|, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = call float @llvm.fabs.f32(float %a.nnan.add) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -22,10 +22,10 @@ ; GCN-LABEL: v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, -v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = fsub float -0.0, %a.nnan.add %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -71,7 +71,7 @@ ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = call float @llvm.copysign.f32(float %a.nnan.add, float %sign) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -101,7 +101,7 @@ ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - 
%a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %b.nnan.add = fadd nnan float %b, 1.0 %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b.nnan.add) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) @@ -166,7 +166,7 @@ ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -182,7 +182,7 @@ ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %b.nnan.add = fadd nnan float %b, 1.0 %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b.nnan.add) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) @@ -215,7 +215,7 @@ ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -232,7 +232,7 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %b.nnan.add = fadd nnan float %b, 1.0 %cmp = icmp eq i32 %c, 0 %known.not.snan = select i1 %cmp, float %a.nnan.add, float %b.nnan.add @@ -269,7 +269,7 @@ ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %cmp = icmp eq i32 %c, 0 %known.not.snan = select i1 %cmp, float %a.nnan.add, float %b %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) @@ -669,3 +669,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable } + +!0 = !{float 2.500000e+00} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -37,7 +37,7 @@ ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 { - %rcp = fdiv float 1.0, %src + %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -47,7 +47,7 @@ ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 { - %rcp = fdiv float 1.0, %src + %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -61,8 +61,7 @@ } ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32: -; SI: v_sqrt_f32_e32 -; SI: v_rcp_f32_e32 +; SI: v_rsq_f32_e32 define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { %sqrt = call float @llvm.sqrt.f32(float %src) %rcp = call float 
@llvm.amdgcn.rcp.f32(float %sqrt) @@ -144,3 +143,5 @@ attributes #2 = { nounwind "unsafe-fp-math"="true" "target-features"="-fp32-denormals" } attributes #3 = { nounwind "unsafe-fp-math"="false" "target-features"="+fp32-denormals" } attributes #4 = { nounwind "unsafe-fp-math"="true" "target-features"="+fp32-denormals" } + +!0 = !{float 2.500000e+00} Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -112,7 +112,7 @@ bb19: ; preds = %bb %tmp20 = uitofp i32 %arg6 to float - %tmp21 = fdiv float 1.000000e+00, %tmp20 + %tmp21 = fdiv float 1.000000e+00, %tmp20, !fpmath !0 %tmp22 = and i32 %arg6, 16777215 br label %bb23 @@ -258,3 +258,5 @@ attributes #0 = { nounwind willreturn } attributes #1 = { nounwind readnone speculatable } + +!0 = !{float 2.500000e+00} Index: llvm/test/CodeGen/AMDGPU/rcp-pattern.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -10,7 +10,7 @@ ; EG: RECIP_IEEE define amdgpu_kernel void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { - %rcp = fdiv float 1.0, %src + %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -71,7 +71,7 @@ ; EG: RECIP_IEEE define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) - %rcp = fdiv float 1.0, %src.fabs + %rcp = fdiv float 1.0, %src.fabs, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -83,7 +83,7 @@ ; EG: RECIP_IEEE define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { - %rcp = fdiv float -1.0, %src + %rcp = fdiv float -1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -95,7 +95,7 @@ define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs - %rcp = fdiv float 1.0, %src.fabs.fneg + %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -109,7 +109,7 @@ define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs - %rcp = fdiv float 1.0, %src.fabs.fneg + %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 store volatile float %rcp, float addrspace(1)* %out, align 4 %other = fmul float %src, %src.fabs.fneg Index: llvm/test/CodeGen/AMDGPU/rcp_iflag.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rcp_iflag.ll +++ llvm/test/CodeGen/AMDGPU/rcp_iflag.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* %out) { %load = load i32, i32 addrspace(1)* %in, align 4 %cvt = uitofp i32 %load to float - %div = fdiv float 1.000000e+00, %cvt + %div = fdiv float 1.000000e+00, %cvt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -15,7 +15,9 @@ define amdgpu_kernel void @rcp_sint(i32 addrspace(1)* %in, float addrspace(1)* %out) { %load = load i32, i32 addrspace(1)* %in, align 4 %cvt = sitofp i32 %load to float - %div = fdiv float 1.000000e+00, %cvt + %div = fdiv float 1.000000e+00, %cvt, !fpmath !0 
store float %div, float addrspace(1)* %out, align 4 ret void } + +!0 = !{float 2.500000e+00} Index: llvm/test/CodeGen/AMDGPU/rsq.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rsq.ll +++ llvm/test/CodeGen/AMDGPU/rsq.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt + %div = fdiv float 1.0, %sqrt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -33,7 +33,7 @@ ; SI: s_endpgm define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt + %div = fdiv float 1.0, %sqrt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -78,13 +78,13 @@ ; SI-SAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] ; SI-SAFE: buffer_store_dword [[RSQ]] -; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}} -; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] -; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] +; SI-UNSAFE: v_sqrt_f32_e32 [[SQRT:v[0-9]+]], v{{[0-9]+}} +; SI-UNSAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] +; SI-UNSAFE: buffer_store_dword [[RSQ]] define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) - %div = fdiv float -1.0, %sqrt + %div = fdiv float -1.0, %sqrt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -109,14 +109,14 @@ ; SI-SAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] ; SI-SAFE: buffer_store_dword [[RSQ]] -; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}} -; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] -; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] +; SI-UNSAFE: v_sqrt_f32_e64 [[SQRT:v[0-9]+]], -v{{[0-9]+}} +; SI-UNSAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] +; SI-UNSAFE: buffer_store_dword [[RSQ]] define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %val.fneg = fsub float -0.0, %val %sqrt = call float @llvm.sqrt.f32(float %val.fneg) - %div = fdiv float -1.0, %sqrt + %div = fdiv float -1.0, %sqrt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -136,3 +136,5 @@ store double %div, double addrspace(1)* %out, align 4 ret void } + +!0 = !{float 2.500000e+00}
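
For reference, a minimal standalone sketch (not part of the patch) of the rewrite the updated AMDGPUCodeGenPrepare performs in the low-accuracy reciprocal case exercised by the tests above. The kernel name @rcp_example is a placeholder, and fp32 denormals are assumed to be flushed (e.g. "target-features"="-fp32-denormals"):

  ; 2.5 ulp of allowed error plus the arcp flag make rcp legal here, so the
  ; pass replaces the fdiv with the rcp intrinsic instead of leaving it for
  ; the full-precision expansion in SIISelLowering.
  define amdgpu_kernel void @rcp_example(float addrspace(1)* %out, float %x) {
    %r = fdiv arcp float 1.0, %x, !fpmath !0
    store float %r, float addrspace(1)* %out
    ret void
  }
  !0 = !{float 2.500000e+00}

  ; After the pass (illustrative), the fdiv becomes:
  ;   %r = call arcp float @llvm.amdgcn.rcp.f32(float %x)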