This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Flush vmcnt with any loop extraneous defs
Needs ReviewPublic

Authored by kerbowa on Jul 5 2023, 12:51 AM.

Download Raw Diff

Details

Reviewers

bsaleil
rochauha
foad
nhaehnle

Summary

Start hoisting the waitcnt to the preheader of loops that use a value loaded outside of the loop and that also contain at least one VMEM load defining a value that is used outside of the loop.

example:

v0 = load(...)
loop {
  ...
  use(v0)
  v1 = load(...)
  ...
  use(v1)
  v2 = load(...)
}
use(v2)

Previously, we would not hoist the waitcnt to the preheader of a loop if any register (or subregister) defined by a VMEM load inside the loop was also used inside the loop. It seems somewhat arbitrary to limit the optimization to loops that only load values but never use them, but I may be missing something. While there is a concern about increased compile time with this change, it is essentially what was done before with FLAT/GLOBAL instructions.

A more thorough approach would try to estimate the minimum number of cycles gained or lost by hoisting the waitcnt, but this would involve further increases in compile time.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

kerbowa created this revision.Jul 5 2023, 12:51 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 5 2023, 12:51 AM

Herald added subscribers: StephenFan, hiraditya, tpr and 5 others. · View Herald Transcript

kerbowa requested review of this revision.Jul 5 2023, 12:51 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 5 2023, 12:52 AM

Herald added subscribers: llvm-commits, wdng. · View Herald Transcript

kerbowa edited the summary of this revision. (Show Details)Jul 5 2023, 12:54 AM

Harbormaster completed remote builds in B243147: Diff 537249.Jul 5 2023, 1:40 AM

kerbowa mentioned this in D154482: [AMDGPU] Flush vmcnt in preheader for loops with loads.Jul 5 2023, 1:41 AM

You haven't added any tests that show the effect of your patch.

arsenm added inline comments.Jul 5 2023, 10:14 AM

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
1770	How does this set distinguish sub and full register defs?
1805	.empty()?
llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
788	-NEXT is much better than -NOT

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIInsertWaitcnts.cpp

33 lines

test/

CodeGen/

AMDGPU/

waitcnt-vmcnt-loop.mir

85 lines

Diff 537249

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Show First 20 Lines • Show All 1,751 Lines • ▼ Show 20 Lines	return SIInstrInfo::isVMEM(MI) \|\|
(SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));		(SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
}		}

// Return true if it is better to flush the vmcnt counter in the preheader of		// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:		// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a		// 1. The loop contains vmem store(s), no vmem load and at least one use of a
// vgpr containing a value that is loaded outside of the loop. (Only on		// vgpr containing a value that is loaded outside of the loop. (Only on
// targets with no vscnt counter).		// targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the		// 2. The loop contains vmem load(s), but at least one of the loaded values is
// loop, and at least one use of a vgpr containing a value that is loaded		// not used in the loop, and at least one use of a vgpr containing a value
// outside of the loop.		// that is loaded outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,		bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
WaitcntBrackets &Brackets) {		WaitcntBrackets &Brackets) {
bool HasVMemLoad = false;		bool HasVMemLoad = false;
bool HasVMemStore = false;		bool HasVMemStore = false;
bool UsesVgprLoadedOutside = false;		bool UsesVgprLoadedOutside = false;
DenseSet<Register> VgprUse;		// Values that are defined within the loop but are not used within the same
DenseSet<Register> VgprDef;		// loop.
		DenseSet<Register> LoopExtraneousDefs;
		arsenmUnsubmitted Not Done Reply Inline Actions How does this set distinguish sub and full register defs? arsenm: How does this set distinguish sub and full register defs?

for (MachineBasicBlock *MBB : ML->blocks()) {		for (MachineBasicBlock *MBB : ML->blocks()) {
for (MachineInstr &MI : *MBB) {		for (MachineInstr &MI : *MBB) {
if (isVMEMOrFlatVMEM(MI)) {		if (isVMEMOrFlatVMEM(MI)) {
if (MI.mayLoad())		if (MI.mayLoad())
HasVMemLoad = true;		HasVMemLoad = true;
if (MI.mayStore())		if (MI.mayStore())
HasVMemStore = true;		HasVMemStore = true;
}		}
for (unsigned I = 0; I < MI.getNumOperands(); I++) {		for (unsigned I = 0; I < MI.getNumOperands(); I++) {
MachineOperand &Op = MI.getOperand(I);		MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() \|\| !TRI->isVectorRegister(*MRI, Op.getReg()))		if (!Op.isReg() \|\| !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;		continue;
RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);		RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
// Vgpr use		// Vgpr use
if (Op.isUse()) {		if (Op.isUse()) {
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {		for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
// If we find a register that is loaded inside the loop, 1. and 2.		LoopExtraneousDefs.erase(RegNo);
// are invalidated and we can exit.
if (VgprDef.contains(RegNo))
return false;
VgprUse.insert(RegNo);
// If at least one of Op's registers is in the score brackets, the		// If at least one of Op's registers is in the score brackets, the
// value is likely loaded outside of the loop.		// value is likely loaded outside of the loop.
if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) {		if (Brackets.getRegScore(RegNo, VM_CNT) >
		Brackets.getScoreLB(VM_CNT))
UsesVgprLoadedOutside = true;		UsesVgprLoadedOutside = true;
break;
}
}		}
}		}
// VMem load vgpr def		// VMem load vgpr def
else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())		else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {		for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
// If we find a register that is loaded inside the loop, 1. and 2.		LoopExtraneousDefs.insert(RegNo);
// are invalidated and we can exit.
if (VgprUse.contains(RegNo))
return false;
VgprDef.insert(RegNo);
}
}		}
}		}
}		}
if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)		if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
return true;		return true;
return HasVMemLoad && UsesVgprLoadedOutside;		return LoopExtraneousDefs.size() && UsesVgprLoadedOutside;
		arsenmUnsubmitted Not Done Reply Inline Actions .empty()? arsenm: .empty()?
}		}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {		bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();		ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();		TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();		TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();		MRI = &MF.getRegInfo();
IV = AMDGPU::getIsaVersion(ST->getCPU());		IV = AMDGPU::getIsaVersion(ST->getCPU());
▲ Show 20 Lines • Show All 165 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir

Show First 20 Lines • Show All 476 Lines • ▼ Show 20 Lines	bb.0:
S_BRANCH %bb.1		S_BRANCH %bb.1

bb.1:		bb.1:
successors: %bb.1, %bb.2		successors: %bb.1, %bb.2

$vgpr10 = COPY $vgpr0		$vgpr10 = COPY $vgpr0

$vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)		$vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
$vgpr11 = COPY $vgpr7		$vgpr11_vgpr12_vgpr13_vgpr14 = COPY $vgpr4_vgpr5_vgpr6_vgpr7
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc		S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc		S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2		S_BRANCH %bb.2

bb.2:		bb.2:
S_ENDPGM 0		S_ENDPGM 0

...		...
▲ Show 20 Lines • Show All 236 Lines • ▼ Show 20 Lines	bb.1:
S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc		S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc		S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2		S_BRANCH %bb.2

bb.2:		bb.2:
S_ENDPGM 0		S_ENDPGM 0

...		...

		# The loop contains a use of a value that is defined outside of the loop, and
		# defines a value within the loop that is used outside of the loop.
		# We expect the waitcnt to be hoisted.

		# GFX9-LABEL: waitcnt_vm_loop_def_used_outside
		# GFX9-LABEL: bb.0:
		# GFX9: S_WAITCNT 39
		# GFX9-LABEL: bb.1:
		# GFX9-NOT: S_WAITCNT 39
		# GFX9-LABEL: bb.2:

		# GFX10-LABEL: waitcnt_vm_loop_def_used_outside
		# GFX10-LABEL: bb.0:
		# GFX10: S_WAITCNT 16
		# GFX10-LABEL: bb.1:
		# GFX10-NOT: S_WAITCNT 16
		# GFX10-LABEL: bb.2:
		name: waitcnt_vm_loop_def_used_outside
		body: \|
		bb.0:
		successors: %bb.1

		$vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec

		S_BRANCH %bb.1

		bb.1:
		successors: %bb.1, %bb.2

		$vgpr10 = COPY $vgpr0

		$vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
		S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
		S_CBRANCH_SCC1 %bb.1, implicit killed $scc
		S_BRANCH %bb.2

		bb.2:
		$vgpr11_vgpr12_vgpr13_vgpr14 = COPY $vgpr4_vgpr5_vgpr6_vgpr7
		S_ENDPGM 0

		...

		# Same as before, except only part of the interval is used outside of the loop.
		# We expect the waitcnt to be hoisted.

		# GFX9-LABEL: waitcnt_vm_loop_def_used_outside_partial_use
		# GFX9-LABEL: bb.0:
		# GFX9: S_WAITCNT 39
		# GFX9-LABEL: bb.1:
		# GFX9-NOT: S_WAITCNT 39
		arsenmUnsubmitted Not Done Reply Inline Actions -NEXT is much better than -NOT arsenm: -NEXT is much better than -NOT
		# GFX9-LABEL: bb.2:

		# GFX10-LABEL: waitcnt_vm_loop_def_used_outside_partial_use
		# GFX10-LABEL: bb.0:
		# GFX10: S_WAITCNT 16
		# GFX10-LABEL: bb.1:
		# GFX10-NOT: S_WAITCNT 16
		# GFX10-LABEL: bb.2:
		name: waitcnt_vm_loop_def_used_outside_partial_use
		body: \|
		bb.0:
		successors: %bb.1

		$vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec

		S_BRANCH %bb.1

		bb.1:
		successors: %bb.1, %bb.2

		$vgpr10 = COPY $vgpr0

		$vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
		S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
		S_CBRANCH_SCC1 %bb.1, implicit killed $scc
		S_BRANCH %bb.2

		bb.2:
		$vgpr11_vgpr12_vgpr13_vgpr14 = COPY $vgpr4_vgpr5_vgpr6_vgpr7
		S_ENDPGM 0

		...