This is an archive of the discontinued LLVM Phabricator instance.

AMDGPUPerfHintAnalysis: don't treat constant loads as large stride
Accepted · Public

Authored by foad on Jul 13 2020, 6:13 AM.

Download Raw Diff

Details

Reviewers

rampitec
nhaehnle
arsenm

Summary

In the "large stride" heuristic, ignore loads from the constant address
space (as well as the local address space). K$ behavior is very different
from L0$, so it doesn't make much sense to use the same heuristic for
them.

Diff Detail

Repository: rG LLVM Github Monorepo

Unit Tests: Failed

	Time	Test
	130 ms	windows > LLVM.CodeGen/AMDGPU::Unknown Unit Message ("")

Event Timeline

foad created this revision. Jul 13 2020, 6:13 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jul 13 2020, 6:13 AM

Herald added subscribers: llvm-commits, kerbowa, hiraditya and 7 others. · View Herald Transcript

Harbormaster completed remote builds in B63946: Diff 277404. Jul 13 2020, 6:47 AM

Testcase?

LGTM, but needs a test.

Add a test case.

foad added a parent revision: D83862: [AMDGPU] Add missing test prefixes. Jul 15 2020, 5:29 AM

Harbormaster failed remote builds in B64334: Diff 278156! Jul 15 2020, 6:05 AM

This is LGTM, but needs test fix first of course.

This revision is now accepted and ready to land. Jul 15 2020, 11:45 AM

Typo "Perh" in commit message.

foad retitled this revision from "AMDGPUPerhHintAnalysis: don't treat constant loads as large stride" to "AMDGPUPerfHintAnalysis: don't treat constant loads as large stride". Jul 15 2020, 11:55 AM

foad added a parent revision: D122804: [AMDGPU] Only count global-to-global as indirect accesses. Mar 31 2022, 5:56 AM

Herald added a project: Restricted Project. · View Herald Transcript · Mar 31 2022, 5:56 AM

Herald added a subscriber: hsmhsm. · View Herald Transcript

foad mentioned this in D122804: [AMDGPU] Only count global-to-global as indirect accesses. Mar 31 2022, 5:58 AM

arsenm added inline comments. Mar 31 2022, 5:59 AM

llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
345–346	This isn't really a good implementation. We use scalar loads in more cases and constant address space isn't a guarantee of SMEM loads

This is LGTM, but needs test fix first of course.

I think the test is OK now I have rebased on D122804?

uabelho added a subscriber: uabelho. Apr 5 2022, 12:15 AM

LGTM I guess, but isConstantAddr really should be fixed. CONSTANT_ADDRESS isn't sufficient, or even that helpful, for knowing whether this will use scalar loads.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPUPerfHintAnalysis.cpp

4 lines

test/

CodeGen/

AMDGPU/

perfhint.ll

20 lines

Diff 278156

llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp

	Show First 20 Lines • Show All 327 Lines • ▼ Show 20 Lines
	AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {			AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
	MemAccessInfo MAI;			MemAccessInfo MAI;
	const Value *MO = getMemoryInstrPtr(Inst);			const Value *MO = getMemoryInstrPtr(Inst);

	LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');			LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
	// Do not treat local-addr memory access as large stride.			// Do not treat local-addr memory access as large stride.
	if (isLocalAddr(MO))			if (isLocalAddr(MO))
	return MAI;			return MAI;
				// Do not treat constant-addr memory access as large stride because K$
				// behavior is very different from L0$.
				if (isConstantAddr(MO))
				return MAI;

	MAI.V = MO;			MAI.V = MO;
	MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);			MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
	return MAI;			return MAI;
	}			}

	bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {			bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
	if (auto PT = dyn_cast<PointerType>(V->getType())) {			if (auto PT = dyn_cast<PointerType>(V->getType())) {
	unsigned As = PT->getAddressSpace();			unsigned As = PT->getAddressSpace();
	return As == AMDGPUAS::CONSTANT_ADDRESS \|\|			return As == AMDGPUAS::CONSTANT_ADDRESS \|\|
	As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;			As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
	[Inline comment by arsenm — unsubmitted, not done] This isn't really a good implementation. We use scalar loads in more cases and constant address space isn't a guarantee of SMEM loads.
	}			}
	return false;			return false;
	}			}

	bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(			bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
	MemAccessInfo &Reference) const {			MemAccessInfo &Reference) const {

	if (!Base \|\| !Reference.Base \|\| Base != Reference.Base)			if (!Base \|\| !Reference.Base \|\| Base != Reference.Base)
	▲ Show 20 Lines • Show All 50 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/perfhint.ll

Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	bb:
store i32 %tmp4, i32 addrspace(1)* %tmp5, align 4		store i32 %tmp4, i32 addrspace(1)* %tmp5, align 4
%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12288		%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12288
%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4		%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
%tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3		%tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
store i32 %tmp7, i32 addrspace(1)* %tmp8, align 4		store i32 %tmp7, i32 addrspace(1)* %tmp8, align 4
ret void		ret void
}		}

		; GCN-LABEL: {{^}}test_constant_not_large_stride:
		; GCN: MemoryBound: 0
		; GCN: WaveLimiterHint : 0
		define amdgpu_kernel void @test_constant_not_large_stride(i32 addrspace(4)* nocapture %arg, i32 addrspace(1)* nocapture %arg1) {
		bb:
		%tmp = getelementptr inbounds i32, i32 addrspace(4)* %arg, i64 4096
		%tmp1 = load i32, i32 addrspace(4)* %tmp, align 4
		%tmp3 = getelementptr inbounds i32, i32 addrspace(4)* %arg, i64 8192
		%tmp4 = load i32, i32 addrspace(4)* %tmp3, align 4
		%tmp6 = getelementptr inbounds i32, i32 addrspace(4)* %arg, i64 12288
		%tmp7 = load i32, i32 addrspace(4)* %tmp6, align 4
		%tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 1
		store i32 %tmp1, i32 addrspace(1)* %tmp2, align 4
		%tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 2
		store i32 %tmp4, i32 addrspace(1)* %tmp5, align 4
		%tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 3
		store i32 %tmp7, i32 addrspace(1)* %tmp8, align 4
		ret void
		}

; GCN-LABEL: {{^}}test_indirect:		; GCN-LABEL: {{^}}test_indirect:
; GCN: MemoryBound: 0		; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 1		; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_indirect(i32 addrspace(1)* nocapture %arg) {		define amdgpu_kernel void @test_indirect(i32 addrspace(1)* nocapture %arg) {
bb:		bb:
%tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1		%tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
%tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2		%tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
%tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3		%tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
▲ Show 20 Lines • Show All 52 Lines • Show Last 20 Lines