This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][Waitcnt] Fix handling of flat instrs
ClosedPublic

Authored by msearles on May 8 2018, 5:43 PM.

Download Raw Diff

Details

Reviewers

arsenm
rampitec

Commits

rGf0b93f1e9e64: [AMDGPU][Waitcnt] Fix handling of flat instrs
rL333926: [AMDGPU][Waitcnt] Fix handling of flat instrs

Summary

On GFX9 and earlier, flat memory ops may decrement VMCNT out-of-order as well as LGKMCNT out-of-order.

Diff Detail

Repository: rL LLVM

Event Timeline

msearles created this revision. · May 8 2018, 5:43 PM

Herald added subscribers: t-tye, tpr, dstuttard and 4 others. · View Herald Transcript · May 8 2018, 5:43 PM

t-tye added inline comments. · May 8 2018, 6:40 PM

lib/Target/AMDGPU/SIInsertWaitcnts.cpp
744 ↗	(On Diff #145829)	Update comment to: // If there is a pending FLAT operation, this is a VM or // LGKM waitcnt, and the target can report early // completion, then we need to force a waitcnt 0.

Update comment as suggested by reviewer.

arsenm added inline comments. · May 10 2018, 4:53 AM

test/CodeGen/AMDGPU/waitcnt.mir
33–37 ↗	(On Diff #145838)	Since this is a change for a specific subtarget, should this test have multiple sets of check lines instead of just changing these?

Add subtarget-specific checks

ping

LGTM

This revision is now accepted and ready to land. · May 30 2018, 11:53 AM

Closed by commit rL333926: [AMDGPU][Waitcnt] Fix handling of flat instrs (authored by msearles). · Explain Why · Jun 4 2018, 9:56 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

AMDGPUSubtarget.h

4 lines

SIInsertWaitcnts.cpp

16 lines

test/

CodeGen/

AMDGPU/

waitcnt.mir

15 lines

Diff 149793

llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h

Show First 20 Lines • Show All 466 Lines • ▼ Show 20 Lines	public:
bool hasFlatGlobalInsts() const {		bool hasFlatGlobalInsts() const {
return FlatGlobalInsts;		return FlatGlobalInsts;
}		}

bool hasFlatScratchInsts() const {		bool hasFlatScratchInsts() const {
return FlatScratchInsts;		return FlatScratchInsts;
}		}

		bool hasFlatLgkmVMemCountInOrder() const {
		return getGeneration() > GFX9;
		}

bool hasD16LoadStore() const {		bool hasD16LoadStore() const {
return getGeneration() >= GFX9;		return getGeneration() >= GFX9;
}		}

/// Return if most LDS instructions have an m0 use that require m0 to be		/// Return if most LDS instructions have an m0 use that require m0 to be
/// iniitalized.		/// iniitalized.
bool ldsRequiresM0Init() const {		bool ldsRequiresM0Init() const {
return getGeneration() < GFX9;		return getGeneration() < GFX9;
▲ Show 20 Lines • Show All 501 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Show First 20 Lines • Show All 130 Lines • ▼ Show 20 Lines
// of each wait counter, and a per-register scoreboard for each wait counter.		// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the		// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within		// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,		// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in		// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.		// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {		class BlockWaitcntBrackets {
public:		public:
BlockWaitcntBrackets() {		BlockWaitcntBrackets(const SISubtarget *SubTarget) : ST(SubTarget) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;		for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {		T = (enum InstCounterType)(T + 1)) {
memset(VgprScores[T], 0, sizeof(VgprScores[T]));		memset(VgprScores[T], 0, sizeof(VgprScores[T]));
}		}
}		}

~BlockWaitcntBrackets() = default;		~BlockWaitcntBrackets() = default;

▲ Show 20 Lines • Show All 161 Lines • ▼ Show 20 Lines	public:
void setMixedExpTypes(bool MixedExpTypesIn) {		void setMixedExpTypes(bool MixedExpTypesIn) {
MixedExpTypes = MixedExpTypesIn;		MixedExpTypes = MixedExpTypesIn;
}		}

void print(raw_ostream &);		void print(raw_ostream &);
void dump() { print(dbgs()); }		void dump() { print(dbgs()); }

private:		private:
		const SISubtarget *ST = nullptr;
bool WaitAtBeginning = false;		bool WaitAtBeginning = false;
bool RevisitLoop = false;		bool RevisitLoop = false;
bool MixedExpTypes = false;		bool MixedExpTypes = false;
int32_t PostOrder = 0;		int32_t PostOrder = 0;
MachineInstr *Waitcnt = nullptr;		MachineInstr *Waitcnt = nullptr;
int32_t ScoreLBs[NUM_INST_CNTS] = {0};		int32_t ScoreLBs[NUM_INST_CNTS] = {0};
int32_t ScoreUBs[NUM_INST_CNTS] = {0};		int32_t ScoreUBs[NUM_INST_CNTS] = {0};
int32_t EventUBs[NUM_WAIT_EVENTS] = {0};		int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
▲ Show 20 Lines • Show All 405 Lines • ▼ Show 20 Lines	if (ScoreToWait == -1) {
return NeedWait;		return NeedWait;
}		}

// If the score of src_operand falls within the bracket, we need an		// If the score of src_operand falls within the bracket, we need an
// s_waitcnt instruction.		// s_waitcnt instruction.
const int32_t LB = getScoreLB(T);		const int32_t LB = getScoreLB(T);
const int32_t UB = getScoreUB(T);		const int32_t UB = getScoreUB(T);
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {		if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if (T == VM_CNT && hasPendingFlat()) {		if ((T == VM_CNT \|\| T == LGKM_CNT) &&
// If there is a pending FLAT operation, and this is a VM waitcnt,		hasPendingFlat() &&
// then we need to force a waitcnt 0 for VM.		!ST->hasFlatLgkmVMemCountInOrder()) {
		// If there is a pending FLAT operation, and this is a VMem or LGKM
		// waitcnt and the target can report early completion, then we need
		// to force a waitcnt 0.
NeedWait = CNT_MASK(T);		NeedWait = CNT_MASK(T);
setScoreLB(T, getScoreUB(T));		setScoreLB(T, getScoreUB(T));
} else if (counterOutOfOrder(T)) {		} else if (counterOutOfOrder(T)) {
// Counter can get decremented out-of-order when there		// Counter can get decremented out-of-order when there
// are multiple types event in the bracket. Also emit an s_wait counter		// are multiple types event in the bracket. Also emit an s_wait counter
// with a conservative value of 0 for the counter.		// with a conservative value of 0 for the counter.
NeedWait = CNT_MASK(T);		NeedWait = CNT_MASK(T);
setScoreLB(T, getScoreUB(T));		setScoreLB(T, getScoreUB(T));
▲ Show 20 Lines • Show All 446 Lines • ▼ Show 20 Lines	if (EmitWaitcnt != 0 \|\| IsForceEmitWaitcnt) {
MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());		MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
if (ContainingLoop) {		if (ContainingLoop) {
MachineBasicBlock *TBB = ContainingLoop->getHeader();		MachineBasicBlock *TBB = ContainingLoop->getHeader();
BlockWaitcntBrackets *ScoreBracket =		BlockWaitcntBrackets *ScoreBracket =
BlockWaitcntBracketsMap[TBB].get();		BlockWaitcntBracketsMap[TBB].get();
if (!ScoreBracket) {		if (!ScoreBracket) {
assert(!BlockVisitedSet.count(TBB));		assert(!BlockVisitedSet.count(TBB));
BlockWaitcntBracketsMap[TBB] =		BlockWaitcntBracketsMap[TBB] =
llvm::make_unique<BlockWaitcntBrackets>();		llvm::make_unique<BlockWaitcntBrackets>(ST);
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();		ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
}		}
ScoreBracket->setRevisitLoop(true);		ScoreBracket->setRevisitLoop(true);
LLVM_DEBUG(dbgs()		LLVM_DEBUG(dbgs()
<< "set-revisit2: Block"		<< "set-revisit2: Block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);		<< ContainingLoop->getHeader()->getNumber() << '\n';);
}		}
}		}
▲ Show 20 Lines • Show All 662 Lines • ▼ Show 20 Lines	for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
E = RPOT.end(), J = RPOT.begin();		E = RPOT.end(), J = RPOT.begin();
I != E;) {		I != E;) {
MachineBasicBlock &MBB = **I;		MachineBasicBlock &MBB = **I;

BlockVisitedSet.insert(&MBB);		BlockVisitedSet.insert(&MBB);

BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();		BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
if (!ScoreBrackets) {		if (!ScoreBrackets) {
BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();		BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();		ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
}		}
ScoreBrackets->setPostOrder(MBB.getNumber());		ScoreBrackets->setPostOrder(MBB.getNumber());
MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);		MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)		if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();		LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

// If we are walking into the block from before the loop, then guarantee		// If we are walking into the block from before the loop, then guarantee
▲ Show 20 Lines • Show All 124 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir

	# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waitcnts %s -o - \| FileCheck %s			# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-insert-waitcnts %s -o - \| FileCheck -check-prefix=GFX89 %s
				# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts %s -o - \| FileCheck -check-prefix=GFX89 %s

	--- \|			--- \|
	define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,			define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
	<4 x i32> addrspace(1)* %global16,			<4 x i32> addrspace(1)* %global16,
	i32* %flat4,			i32* %flat4,
	<4 x i32>* %flat16) {			<4 x i32>* %flat16) {
	ret void			ret void
	}			}
	Show All 15 Lines
	# CHECK: FLAT_LOAD_DWORD			# CHECK: FLAT_LOAD_DWORD
	# CHECK: FLAT_LOAD_DWORDX4			# CHECK: FLAT_LOAD_DWORDX4
	# Global loads will return in order so we should:			# Global loads will return in order so we should:
	# s_waitcnt vmcnt(1) lgkmcnt(1)			# s_waitcnt vmcnt(1) lgkmcnt(1)
	# CHECK-NEXT: S_WAITCNT 369			# CHECK-NEXT: S_WAITCNT 369

	# CHECK-LABEL: bb.1:			# CHECK-LABEL: bb.1:
	# CHECK: FLAT_LOAD_DWORD			# CHECK: FLAT_LOAD_DWORD
	# CHECK: S_WAITCNT 368			# GFX89: S_WAITCNT 112
	# CHECK: FLAT_LOAD_DWORDX4			# CHECK: FLAT_LOAD_DWORDX4
	# The first load has no mem operand, so we should assume it accesses the flat
	# address space.
	# s_waitcnt lgkmcnt(1)
	# CHECK-NEXT: S_WAITCNT 383

	# CHECK-LABEL: bb.2:			# CHECK-LABEL: bb.2:
	# CHECK: FLAT_LOAD_DWORD			# CHECK: FLAT_LOAD_DWORD
	# CHECK: S_WAITCNT 368			# GFX89: S_WAITCNT 112
	# CHECK: FLAT_LOAD_DWORDX4			# CHECK: FLAT_LOAD_DWORDX4

	# One outstanding load accesses the flat address space.
	# s_waitcnt lgkmcnt(1)
	# CHECK-NEXT: S_WAITCNT 383

	name: flat_zero_waitcnt			name: flat_zero_waitcnt

	body: \|			body: \|
	bb.0:			bb.0:
	successors: %bb.1			successors: %bb.1
	$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)			$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
	$vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)			$vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
	$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec			$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
	▲ Show 20 Lines • Show All 71 Lines • Show Last 20 Lines