This is an archive of the discontinued LLVM Phabricator instance.

Differential D20556

AMDGPU: Skip waiting on lgkmcnt for global flat loads
AbandonedPublic

Authored by arsenm on May 23 2016, 9:18 PM.

Download Raw Diff

Details

Reviewers

• tstellarAMD

Summary

If we know the access isn't to a flat address,
the wait for LDS is not necessary.

Diff Detail

Event Timeline

arsenm updated this revision to Diff 58192.May 23 2016, 9:18 PM

arsenm retitled this revision from to AMDGPU: Skip waiting on lgkmcnt for global flat loads.

arsenm updated this object.

arsenm added a reviewer: • tstellarAMD.

arsenm added a subscriber: llvm-commits.

Herald added subscribers: kzhuravl, arsenm. · View Herald TranscriptMay 23 2016, 9:18 PM

• tstellarAMD added inline comments.May 24 2016, 5:36 PM

lib/Target/AMDGPU/SIInsertWaits.cpp
221–225	I'm not really sure exactly what this is doing, but as long as this accounts for the fact that the hw LGKM counter is always incremented even if the operation accesses global memory, then this is fine. Though, I think you should add some tests that have LDS operations before and after a flat instruction that accesses global memory.

arsenm added inline comments.May 24 2016, 5:39 PM

lib/Target/AMDGPU/SIInsertWaits.cpp
221–225	I don't think this is accounting for the hardware increase

This pass was replaced

Herald added subscribers: kerbowa, t-tye, tpr and 5 others. · View Herald TranscriptFeb 18 2020, 7:29 AM

t-tye added inline comments.Feb 18 2020, 9:38 AM

lib/Target/AMDGPU/SIInsertWaits.cpp
221–225	But does the hardware increasing the LGKM counter matter? The hardware will increase it, then decrease it once it determines the FLAT address is targeting LDS. So all it can affect is other memory operations waiting on LGKM, causing them to wait a bit longer. It cannot make any other memory operation satisfy its WAITCNT early, so it cannot break correctness. The completion of a FLAT operation that is known to only target VMEM only needs to wait on the vmem counter.

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIInsertWaits.cpp

15 lines

test/

CodeGen/

AMDGPU/

waitcnt-flat.ll

44 lines

Diff 58192

lib/Target/AMDGPU/SIInsertWaits.cpp

Show First 20 Lines • Show All 173 Lines • ▼ Show 20 Lines
static bool readsVCCZ(unsigned Opcode) {		static bool readsVCCZ(unsigned Opcode) {
return Opcode == AMDGPU::S_CBRANCH_VCCNZ \|\| Opcode == AMDGPU::S_CBRANCH_VCCZ;		return Opcode == AMDGPU::S_CBRANCH_VCCNZ \|\| Opcode == AMDGPU::S_CBRANCH_VCCZ;
}		}

bool SIInsertWaits::hasOutstandingLGKM() const {		bool SIInsertWaits::hasOutstandingLGKM() const {
return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;		return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
}		}

		static bool hasGlobalMemOperand(const MachineInstr &MI) {
		if (!MI.hasOneMemOperand())
		return false;

		MachineMemOperand MMO = MI.memoperands_begin();
		unsigned AS = MMO->getAddrSpace();
		return AS == AMDGPUAS::GLOBAL_ADDRESS \|\|
		AS == AMDGPUAS::CONSTANT_ADDRESS;
		}

Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {		Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
uint64_t TSFlags = MI.getDesc().TSFlags;		uint64_t TSFlags = MI.getDesc().TSFlags;
Counters Result = { { 0, 0, 0 } };		Counters Result = { { 0, 0, 0 } };

Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);		Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);

// Only consider stores or EXP for EXP_CNT		// Only consider stores or EXP for EXP_CNT
Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&		Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
Show All 13 Lines	if (TII->isSMRD(MI)) {
unsigned Size = RC->getSize();		unsigned Size = RC->getSize();
Result.Named.LGKM = Size > 4 ? 2 : 1;		Result.Named.LGKM = Size > 4 ? 2 : 1;
} else {		} else {
// s_dcache_inv etc. do not have a a destination register. Assume we		// s_dcache_inv etc. do not have a a destination register. Assume we
// want a wait on these.		// want a wait on these.
// XXX - What is the right value?		// XXX - What is the right value?
Result.Named.LGKM = 1;		Result.Named.LGKM = 1;
}		}
		} else if (TII->isFLAT(MI)) {
		// If we know the pointer is not accessing a flat address, we don't need
		// to wait for lgkm.
		if (!hasGlobalMemOperand(MI))
		Result.Named.LGKM = 1;
		tstellarAMDUnsubmitted Not Done Reply Inline Actions I'm not really sure exactly what this is doing, but as long as this accounts for the fact that the hw LGKM counter is always incremented even if the operation accesses global memory, then this is fine. Though, I think you should add some tests that have LDS operations before and after a flat instruction that accesses global memory. tstellarAMD: I'm not really sure exactly what this is doing, but as long as this accounts for the fact that…
		arsenmAuthorUnsubmitted Not Done Reply Inline Actions I don't think this is accounting for the hardware increase arsenm: I don't think this is accounting for the hardware increase
		t-tyeUnsubmitted Not Done Reply Inline Actions But does the hardware increasing the LGKM counter matter? The hardware will increase it, then decrease it once it determines the FLAT address is targeting LDS. So all it can affect is other memory operations waiting on LGKM, causing them to wait a bit longer. It cannot make any other memory operation satisfy its WAITCNT early, so it cannot break correctness. The completion of a FLAT operation that is known to only target VMEM only needs to wait on the vmem counter. t-tye: But does the hardware increasing the LGKM counter matter? The hardware will increase it, then…
} else {		} else {
// DS		// DS
Result.Named.LGKM = 1;		Result.Named.LGKM = 1;
}		}

} else {		} else {
Result.Named.LGKM = 0;		Result.Named.LGKM = 0;
}		}
▲ Show 20 Lines • Show All 402 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/waitcnt-flat.ll

	; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri \| FileCheck --check-prefix=GCN %s			; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri \| FileCheck --check-prefix=GCN %s
	; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji \| FileCheck --check-prefix=GCN %s			; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji \| FileCheck --check-prefix=GCN %s

	; If flat_store_dword and flat_load_dword use different registers for the data			; If flat_store_dword and flat_load_dword use different registers for the data
	; operand, this test is not broken. It just means it is no longer testing			; operand, this test is not broken. It just means it is no longer testing
	; for the original bug.			; for the original bug.

	; GCN: {{^}}test:			; GCN-LABEL: {{^}}global_test:
				; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}],
				; GCN: s_waitcnt vmcnt(0){{$}}
				; GCN: flat_load_dword

				; Test pointer problem
	; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]			; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
	; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)			; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
	; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]			; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
	define void @test(i32 addrspace(1)* %out, i32 %in) {			define void @global_test(i32 addrspace(1)* %out, i32 %in) {
	store volatile i32 0, i32 addrspace(1)* %out			store volatile i32 0, i32 addrspace(1)* %out
	%val = load volatile i32, i32 addrspace(1)* %out			%val = load volatile i32, i32 addrspace(1)* %out
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}flat_test:
				; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}],
				; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
				; GCN: flat_load_dword
				define void @flat_test(i32 addrspace(4)* %out, i32 %in) {
				store volatile i32 0, i32 addrspace(4)* %out
				%val = load volatile i32, i32 addrspace(4)* %out
				ret void
				}

				; If the store is not through a generic pointer, the lgkmcnt is not
				; needed.

				; GCN-LABEL: {{^}}global_flat_test:
				; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}],
				; GCN: s_waitcnt vmcnt(0){{$}}
				; GCN: flat_load_dword
				define void @global_flat_test(i32 addrspace(1)* %out, i32 %in) {
				store volatile i32 0, i32 addrspace(1)* %out
				%out.cast = addrspacecast i32 addrspace(1)* %out to i32 addrspace(4)*
				%val = load volatile i32, i32 addrspace(4)* %out.cast
				ret void
				}

				; GCN-LABEL: {{^}}flat_global_test:
				; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}],
				; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
				; GCN: flat_load_dword
				define void @flat_global_test(i32 addrspace(1)* %out, i32 %in) {
				%out.cast = addrspacecast i32 addrspace(1)* %out to i32 addrspace(4)*
				store volatile i32 0, i32 addrspace(4)* %out.cast
				%val = load volatile i32, i32 addrspace(1)* %out
				ret void
				}