Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -620,10 +620,12 @@ return nullptr; Type *Ty = Den->getType(); - Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty); if (const ConstantFP *CLHS = dyn_cast(Num)) { if (AllowInaccurateRcp || RcpIsAccurate) { if (CLHS->isExactlyValue(1.0)) { + Function *Decl = Intrinsic::getDeclaration( + Mod, Intrinsic::amdgcn_rcp, Ty); + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to @@ -636,10 +638,13 @@ // 1.0 / x -> rcp(x) return Builder.CreateCall(Decl, { Den }); - } + } // Same as for 1.0, but expand the sign out of the constant. - if (CLHS->isExactlyValue(-1.0)) { + if (CLHS->isExactlyValue(-1.0)) { + Function *Decl = Intrinsic::getDeclaration( + Mod, Intrinsic::amdgcn_rcp, Ty); + // -1.0 / x -> rcp (fneg x) Value *FNeg = Builder.CreateFNeg(Den); return Builder.CreateCall(Decl, { FNeg }); @@ -648,6 +653,9 @@ } if (AllowInaccurateRcp) { + Function *Decl = Intrinsic::getDeclaration( + Mod, Intrinsic::amdgcn_rcp, Ty); + // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) Value *Recip = Builder.CreateCall(Decl, { Den });