A function with fewer memory instructions but wider accesses is equivalent, in terms of memory boundedness, to a function with more instructions but narrower accesses. In fact, without this change the pass would give different answers before and after vectorization.
Diff Detail
- Repository
- rG LLVM Github Monorepo
Event Timeline
llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
- 222–223: The "InstCount" names seem like a lie since we are no longer counting the number of instructions, but I don't have a better idea.
llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
- 222–223: Yes, I also didn't come up with a better name. In fact it always was a lie, since an IR instruction is not a HW instruction. All of this was inspired by the desire to move the pass later in the pipeline, and then I realized that to preserve its behavior I need to adjust it for LD/ST combining. But the metric's name has now drifted even further from reality than it used to be.
llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
- 222–223: How about Count => Cost? Also, we need to make the same change to IAMInstCount and LSMInstCount to keep the cost calculation consistent.
llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
- 222–223: Thanks, cost sounds better.
- Renamed variables to read 'cost' instead of 'count'.
- Adjusted IAM and LSM costs accordingly.