This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][Waitcnt] Fix handling of loops with many bottom blocks
ClosedPublic

Authored by msearles on May 29 2018, 10:39 AM.

Download Raw Diff

Details

Reviewers

rampitec
cfang

Commits

rG1054541490db: [AMDGPU][Waitcnt] Fix handling of loops with many bottom blocks
rL333556: [AMDGPU][Waitcnt] Fix handling of loops with many bottom blocks

Summary

When inserting waitcnts, the waitcnt pass forces convergence for a loop if necessary. Previously, the forced convergence kicked in after more than 2 passes over a loop, which doesn't account for loops with many bottom blocks. So, increase the threshold to (n+1), where n is the number of bottom blocks. This gives the pass an opportunity to consider the contribution of each bottom block to the overall loop before the forced convergence potentially kicks in.

Diff Detail

Repository: rL LLVM

Event Timeline

msearles created this revision.May 29 2018, 10:39 AM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald TranscriptMay 29 2018, 10:39 AM

LGTM

This revision is now accepted and ready to land.May 29 2018, 10:43 AM

Closed by commit rL333556: [AMDGPU][Waitcnt] Fix handling of loops with many bottom blocks (authored by msearles). · Explain WhyMay 30 2018, 8:51 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

SIInsertWaitcnts.cpp

42 lines

test/

CodeGen/

AMDGPU/

waitcnt-back-edge-loop.mir

34 lines

Diff 149132

llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Show First 20 Lines • Show All 339 Lines • ▼ Show 20 Lines
// at the end of the loop footer.		// at the end of the loop footer.
class LoopWaitcntData {		class LoopWaitcntData {
public:		public:
LoopWaitcntData() = default;		LoopWaitcntData() = default;
~LoopWaitcntData() = default;		~LoopWaitcntData() = default;

void incIterCnt() { IterCnt++; }		void incIterCnt() { IterCnt++; }
void resetIterCnt() { IterCnt = 0; }		void resetIterCnt() { IterCnt = 0; }
int32_t getIterCnt() { return IterCnt; }		unsigned getIterCnt() { return IterCnt; }

void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }		void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
MachineInstr *getWaitcnt() const { return LfWaitcnt; }		MachineInstr *getWaitcnt() const { return LfWaitcnt; }

void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); }		void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); }

private:		private:
// s_waitcnt added at the end of loop footer to stablize wait scores		// s_waitcnt added at the end of loop footer to stablize wait scores
▲ Show 20 Lines • Show All 843 Lines • ▼ Show 20 Lines	if (EmitWaitcnt != 0 \|\| IsForceEmitWaitcnt) {
if (!ScoreBracket) {		if (!ScoreBracket) {
assert(!BlockVisitedSet.count(TBB));		assert(!BlockVisitedSet.count(TBB));
BlockWaitcntBracketsMap[TBB] =		BlockWaitcntBracketsMap[TBB] =
llvm::make_unique<BlockWaitcntBrackets>();		llvm::make_unique<BlockWaitcntBrackets>();
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();		ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
}		}
ScoreBracket->setRevisitLoop(true);		ScoreBracket->setRevisitLoop(true);
LLVM_DEBUG(dbgs()		LLVM_DEBUG(dbgs()
<< "set-revisit: Block"		<< "set-revisit2: Block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);		<< ContainingLoop->getHeader()->getNumber() << '\n';);
}		}
}		}

// Update an existing waitcount, or make a new one.		// Update an existing waitcount, or make a new one.
unsigned Enc = AMDGPU::encodeWaitcnt(IV,		unsigned Enc = AMDGPU::encodeWaitcnt(IV,
ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],		ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],		ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
▲ Show 20 Lines • Show All 417 Lines • ▼ Show 20 Lines	#endif

// if a single block loop, update the score brackets. Not needed for other		// if a single block loop, update the score brackets. Not needed for other
// blocks, as we did this in-place		// blocks, as we did this in-place
if (IsSelfPred) {		if (IsSelfPred) {
BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);		BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
}		}
}		}

/// Return true if the given basic block is a "bottom" block of a loop. This		/// Return true if the given basic block is a "bottom" block of a loop.
/// differs from MachineLoop::getBottomBlock in that it works even if the loop		/// This works even if the loop is discontiguous. This also handles
/// is discontiguous. This also handles multiple back-edges for the same		/// multiple back-edges for the same "header" block of a loop.
/// "header" block of a loop.
bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,		bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
const MachineBasicBlock *Block) {		const MachineBasicBlock *Block) {
for (MachineBasicBlock *MBB : Loop->blocks()) {		for (MachineBasicBlock *MBB : Loop->blocks()) {
if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {		if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
return true;		return true;
}		}
}		}
return false;		return false;
▲ Show 20 Lines • Show All 117 Lines • ▼ Show 20 Lines	#endif
// Check if we need to force convergence at loop footer.		// Check if we need to force convergence at loop footer.
MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);		MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {		if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();		LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->print();		WaitcntData->print();
LLVM_DEBUG(dbgs() << '\n';);		LLVM_DEBUG(dbgs() << '\n';);

// The iterative waitcnt insertion algorithm aims for optimal waitcnt		// The iterative waitcnt insertion algorithm aims for optimal waitcnt
// placement and doesn't always guarantee convergence for a loop. Each		// placement, but doesn't guarantee convergence for a loop. Each
// loop should take at most 2 iterations for it to converge naturally.		// loop should take at most (n+1) iterations for it to converge naturally,
// When this max is reached and result doesn't converge, we force		// where n is the number of bottom blocks. If this threshold is reached and
// convergence by inserting a s_waitcnt at the end of loop footer.		// the result hasn't converged, then we force convergence by inserting
if (WaitcntData->getIterCnt() > 2) {		// a s_waitcnt at the end of loop footer.
		if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
// To ensure convergence, need to make wait events at loop footer be no		// To ensure convergence, need to make wait events at loop footer be no
// more than those from the previous iteration.		// more than those from the previous iteration.
// As a simplification, instead of tracking individual scores and		// As a simplification, instead of tracking individual scores and
// generating the precise wait count, just wait on 0.		// generating the precise wait count, just wait on 0.
bool HasPending = false;		bool HasPending = false;
MachineInstr *SWaitInst = WaitcntData->getWaitcnt();		MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;		for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {		T = (enum InstCounterType)(T + 1)) {
if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {		if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));		ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
HasPending = true;		HasPending = true;
		break;
}		}
}		}

if (HasPending) {		if (HasPending) {
if (!SWaitInst) {		if (!SWaitInst) {
SWaitInst = Block.getParent()->CreateMachineInstr(		SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
TII->get(AMDGPU::S_WAITCNT), DebugLoc());		DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
		.addImm(0);
TrackedWaitcntSet.insert(SWaitInst);		TrackedWaitcntSet.insert(SWaitInst);
const MachineOperand &Op = MachineOperand::CreateImm(0);
SWaitInst->addOperand(MF, Op);
#if 0 // TODO: Format the debug output		#if 0 // TODO: Format the debug output
OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);		OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
OutputTransformAdd(SWaitInst, context);		OutputTransformAdd(SWaitInst, context);
#endif		#endif
}		}
#if 0 // TODO: ??		#if 0 // TODO: ??
_DEV( REPORTED_STATS->force_waitcnt_converge = 1; )		_DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif		#endif
▲ Show 20 Lines • Show All 80 Lines • ▼ Show 20 Lines	for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {		if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
unsigned Count = countNumBottomBlocks(ContainingLoop);		unsigned Count = countNumBottomBlocks(ContainingLoop);

// If the loop has multiple back-edges, and so more than one "bottom"		// If the loop has multiple back-edges, and so more than one "bottom"
// basic block, we have to guarantee a re-walk over every blocks.		// basic block, we have to guarantee a re-walk over every blocks.
if ((std::count(BlockWaitcntProcessedSet.begin(),		if ((std::count(BlockWaitcntProcessedSet.begin(),
BlockWaitcntProcessedSet.end(), &MBB) < Count)) {		BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);		BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
LLVM_DEBUG(dbgs() << "set-revisit: Block"		LLVM_DEBUG(dbgs() << "set-revisit1: Block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);		<< ContainingLoop->getHeader()->getNumber() << '\n';);
}		}
}		}

// Walk over the instructions.		// Walk over the instructions.
insertWaitcntInBlock(MF, MBB);		insertWaitcntInBlock(MF, MBB);

// Flag that waitcnts have been processed at least once.		// Record that waitcnts have been processed at least once for this block.
BlockWaitcntProcessedSet.push_back(&MBB);		BlockWaitcntProcessedSet.push_back(&MBB);

// See if we want to revisit the loop. If a loop has multiple back-edges,		// See if we want to revisit the loop. If a loop has multiple back-edges,
// we shouldn't revisit the same "bottom" basic block.		// we shouldn't revisit the same "bottom" basic block.
if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&		if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
std::count(BlockWaitcntProcessedSet.begin(),		std::count(BlockWaitcntProcessedSet.begin(),
BlockWaitcntProcessedSet.end(), &MBB) == 1) {		BlockWaitcntProcessedSet.end(), &MBB) == 1) {
MachineBasicBlock *EntryBB = ContainingLoop->getHeader();		MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines	bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (!MFI->isEntryFunction()) {		if (!MFI->isEntryFunction()) {
// Wait for any outstanding memory operations that the input registers may		// Wait for any outstanding memory operations that the input registers may
// depend on. We can't track them and it's better to the wait after the		// depend on. We can't track them and it's better to the wait after the
// costly call sequence.		// costly call sequence.

// TODO: Could insert earlier and schedule more liberally with operations		// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.		// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();		MachineBasicBlock &EntryBB = MF.front();
BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))		auto SWaitInst = BuildMI(EntryBB, EntryBB.getFirstNonPHI(),
		DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm(0);		.addImm(0);

		LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
		<< "New Instr: " << *SWaitInst << '\n');

Modified = true;		Modified = true;
}		}

return Modified;		return Modified;
}		}

llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir

Show First 20 Lines • Show All 51 Lines • ▼ Show 20 Lines	bb.4:
S_BRANCH %bb.3		S_BRANCH %bb.3

bb.5:		bb.5:

$vgpr4 = V_MAC_F32_e32 killed $vgpr0, killed $vgpr3, killed $vgpr4, implicit $exec		$vgpr4 = V_MAC_F32_e32 killed $vgpr0, killed $vgpr3, killed $vgpr4, implicit $exec
EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec		EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec
S_ENDPGM		S_ENDPGM
...		...
		---

		# GCN-LABEL: name: waitcnt-multiple-back-edges{{$}}
		# GCN: bb.0:
		# GCN: S_WAITCNT 0
		# GCN-NEXT: S_BRANCH %bb.2

		name: waitcnt-multiple-back-edges
		body: \|
		bb.0:
		S_BRANCH %bb.2

		bb.1:
		S_BRANCH %bb.2

		bb.2:
		S_CBRANCH_VCCZ %bb.1, implicit $vcc

		bb.3:
		S_CBRANCH_VCCNZ %bb.5, implicit $vcc

		bb.4:
		BUFFER_ATOMIC_ADD_OFFSET renamable $vgpr0, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 0, 4, 0, implicit $exec
		S_CBRANCH_SCC0 %bb.2, implicit $scc
		S_BRANCH %bb.6

		bb.5:
		S_CBRANCH_SCC0 %bb.2, implicit $scc
		S_BRANCH %bb.6

		bb.6:
		S_CBRANCH_SCC1 %bb.0, implicit $scc
		S_ENDPGM
		...