This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Remove pointless libcall recognition of native_{divide|recip}
ClosedPublic

Authored by arsenm on Jul 31 2023, 5:15 AM.

Download Raw Diff

Details

Reviewers

rampitec
vpykhtin
jhuber6
dfukalov
yaxunl

Group Reviewers

Restricted Project

Summary

This was trying to constant fold these calls, and also turn some of
them into a regular fmul/fdiv. There's no point in doing that; the
underlying library implementation should be using those in the first
place. Even when the library does use the rcp intrinsics, the backend
handles constant folding of those. This was also only performing the
folds under overly strict fast-everything-is-required conditions.

The one possible plus this gained over linking in the library is that,
if you were using all fast math flags, it would propagate them to the new
instructions. We could address this in the library by adding more fast
math flags to the native implementations.

The constant fold case also had no test coverage.

Diff Detail

Event Timeline

arsenm created this revision.Jul 31 2023, 5:15 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 31 2023, 5:15 AM

Herald added subscribers: foad, kerbowa, hiraditya and 4 others. · View Herald Transcript

arsenm requested review of this revision.Jul 31 2023, 5:15 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 31 2023, 5:15 AM

Herald added a subscriber: wdng. · View Herald Transcript

arsenm added a child revision: D156677: AMDGPU: Remove pointless libcall optimization of fma/mad.Jul 31 2023, 5:16 AM

Harbormaster completed remote builds in B249175: Diff 545604.Jul 31 2023, 6:09 AM

Rebase

Harbormaster completed remote builds in B249230: Diff 545676.Jul 31 2023, 8:14 AM

ping

rampitec accepted this revision.Aug 9 2023, 3:35 PM

This revision is now accepted and ready to land.Aug 9 2023, 3:35 PM

arsenm mentioned this in rG6448d5ba581a: AMDGPU: Remove pointless libcall recognition of native_{divide|recip}.Aug 9 2023, 3:48 PM

6448d5ba581a275ddaf9504368690abcf1aec244

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPULibCalls.cpp

58 lines

test/

CodeGen/

AMDGPU/

simplify-libcalls.ll

8 lines

Diff 545676

llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp

Show First 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	private:
FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);		FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);

bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);		bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

bool TDOFold(CallInst *CI, const FuncInfo &FInfo);		bool TDOFold(CallInst *CI, const FuncInfo &FInfo);

/* Specialized optimizations */		/* Specialized optimizations */

// recip (half or native)
bool fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);

// divide (half or native)
bool fold_divide(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);

// pow/powr/pown		// pow/powr/pown
bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,		bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
const FuncInfo &FInfo);		const FuncInfo &FInfo);

// rootn		// rootn
bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);		bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

// fma/mad		// fma/mad
▲ Show 20 Lines • Show All 565 Lines • ▼ Show 20 Lines	case AMDGPULibFunc::EI_POWN:
return fold_pow(FPOp, B, FInfo);		return fold_pow(FPOp, B, FInfo);
case AMDGPULibFunc::EI_ROOTN:		case AMDGPULibFunc::EI_ROOTN:
return fold_rootn(FPOp, B, FInfo);		return fold_rootn(FPOp, B, FInfo);
case AMDGPULibFunc::EI_SQRT:		case AMDGPULibFunc::EI_SQRT:
return fold_sqrt(FPOp, B, FInfo);		return fold_sqrt(FPOp, B, FInfo);
case AMDGPULibFunc::EI_COS:		case AMDGPULibFunc::EI_COS:
case AMDGPULibFunc::EI_SIN:		case AMDGPULibFunc::EI_SIN:
return fold_sincos(FPOp, B, FInfo, AA);		return fold_sincos(FPOp, B, FInfo, AA);
case AMDGPULibFunc::EI_RECIP:
// skip vector function
assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE \|\|
FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
"recip must be an either native or half function");
return (getVecSize(FInfo) != 1) ? false : fold_recip(CI, B, FInfo);

case AMDGPULibFunc::EI_DIVIDE:
// skip vector function
assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE \|\|
FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
"divide must be an either native or half function");
return (getVecSize(FInfo) != 1) ? false : fold_divide(CI, B, FInfo);
case AMDGPULibFunc::EI_FMA:		case AMDGPULibFunc::EI_FMA:
case AMDGPULibFunc::EI_MAD:		case AMDGPULibFunc::EI_MAD:
case AMDGPULibFunc::EI_NFMA:		case AMDGPULibFunc::EI_NFMA:
// skip vector function		// skip vector function
return (getVecSize(FInfo) != 1) ? false : fold_fma_mad(CI, B, FInfo);		return (getVecSize(FInfo) != 1) ? false : fold_fma_mad(CI, B, FInfo);
default:		default:
break;		break;
}		}
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines	if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
}		}
}		}
}		}
}		}

return false;		return false;
}		}

// [native_]half_recip(c) ==> 1.0/c
bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
const FuncInfo &FInfo) {
Value *opr0 = CI->getArgOperand(0);
if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
// Just create a normal div. Later, InstCombine will be able
// to compute the divide into a constant (avoid check float infinity
// or subnormal at this point).
Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0),
opr0,
"recip2div");
LLVM_DEBUG(errs() << "AMDIC: " << CI << " ---> " << nval << "\n");
replaceCall(nval);
return true;
}
return false;
}

// [native_]half_divide(x, c) ==> x/c
bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
const FuncInfo &FInfo) {
Value *opr0 = CI->getArgOperand(0);
Value *opr1 = CI->getArgOperand(1);
ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);

if ((CF0 && CF1) \|\| // both are constants
(CF1 && (getArgType(FInfo) == AMDGPULibFunc::F32)))
// CF1 is constant && f32 divide
{
Value *nval1 = B.CreateFDiv(ConstantFP::get(opr1->getType(), 1.0),
opr1, "__div2recip");
Value *nval = B.CreateFMul(opr0, nval1, "__div2mul");
replaceCall(nval);
return true;
}
return false;
}

namespace llvm {		namespace llvm {
static double log2(double V) {		static double log2(double V) {
#if _XOPEN_SOURCE >= 600 \|\| defined(_ISOC99_SOURCE) \|\| _POSIX_C_SOURCE >= 200112L		#if _XOPEN_SOURCE >= 600 \|\| defined(_ISOC99_SOURCE) \|\| _POSIX_C_SOURCE >= 200112L
return ::log2(V);		return ::log2(V);
#else		#else
return log(V) / numbers::ln2;		return log(V) / numbers::ln2;
#endif		#endif
}		}
▲ Show 20 Lines • Show All 959 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll

	Show First 20 Lines • Show All 150 Lines • ▼ Show 20 Lines
	entry:			entry:
	%call = call fast float @_Z10half_recipf(float 3.000000e+00)			%call = call fast float @_Z10half_recipf(float 3.000000e+00)
	store float %call, ptr addrspace(1) %a, align 4			store float %call, ptr addrspace(1) %a, align 4
	ret void			ret void
	}			}

	declare float @_Z10half_recipf(float)			declare float @_Z10half_recipf(float)

				; Do nothing, the underlying implementation will optimize correctly
				; after inlining.
	; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide			; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide
	; GCN: fmul fast float %tmp, 0x3FD5555560000000			; GCN: %call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
	define amdgpu_kernel void @test_native_divide(ptr addrspace(1) nocapture %a) {			define amdgpu_kernel void @test_native_divide(ptr addrspace(1) nocapture %a) {
	entry:			entry:
	%tmp = load float, ptr addrspace(1) %a, align 4			%tmp = load float, ptr addrspace(1) %a, align 4
	%call = call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)			%call = call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
	store float %call, ptr addrspace(1) %a, align 4			store float %call, ptr addrspace(1) %a, align 4
	ret void			ret void
	}			}

	declare float @_Z13native_divideff(float, float)			declare float @_Z13native_divideff(float, float)

				; Do nothing, the optimization will naturally happen after inlining.

	; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide			; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide
	; GCN: fmul fast float %tmp, 0x3FD5555560000000			; GCN: %call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
	define amdgpu_kernel void @test_half_divide(ptr addrspace(1) nocapture %a) {			define amdgpu_kernel void @test_half_divide(ptr addrspace(1) nocapture %a) {
	entry:			entry:
	%tmp = load float, ptr addrspace(1) %a, align 4			%tmp = load float, ptr addrspace(1) %a, align 4
	%call = call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)			%call = call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
	store float %call, ptr addrspace(1) %a, align 4			store float %call, ptr addrspace(1) %a, align 4
	ret void			ret void
	}			}

	▲ Show 20 Lines • Show All 613 Lines • Show Last 20 Lines