This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AMDGPU/
-
Target/
-
AMDGPU/
1/2
SIInsertWaitcnts.cpp
-
test/CodeGen/AMDGPU/
-
CodeGen/
-
AMDGPU/
-
GlobalISel/
-
add.v2i16.ll
-
andn2.ll
-
artifact-combiner-asserts.ll
-
ashr.ll
-
br-constant-invalid-sgpr-copy.ll
1
bswap.ll
-
bug-legalization-artifact-combiner-dead-def.ll
-
clamp-fmed3-const-combine.ll
-
clamp-minmax-const-combine.ll
-
combine-fma-add-fma-mul.ll
-
combine-fma-add-mul.ll
-
combine-fma-sub-mul.ll
-
combine-fma-sub-neg-mul.ll
-
dynamic-alloca-uniform.ll
-
extractelement.i128.ll
-
extractelement.i16.ll
-
extractelement.i8.ll
-
extractelement.ll
-
fdiv.f16.ll
-
fdiv.f32.ll
-
fdiv.f64.ll
-
flat-scratch.ll
-
fma.ll
-
fmed3-min-max-const-combine.ll
-
fmul.v2f16.ll
-
fpow.ll
-
fshl.ll
-
fshr.ll
-
image-waterfall-loop-O0.ll
-
insertelement.ll
-
llvm.amdgcn.div.fmas.ll
-
llvm.amdgcn.ds.fadd.ll
-
llvm.amdgcn.ds.fmin.ll
-
llvm.amdgcn.fdot2.ll
-
llvm.amdgcn.fmul.legacy.ll
-
llvm.amdgcn.global.atomic.csub.ll
-
llvm.amdgcn.image.load.2d.ll
-
llvm.amdgcn.image.load.2darraymsaa.a16.ll
-
llvm.amdgcn.image.load.2darraymsaa.ll
-
llvm.amdgcn.image.load.3d.a16.ll
-
llvm.amdgcn.image.load.3d.ll
-
llvm.amdgcn.s.setreg.ll
-
llvm.amdgcn.sdot2.ll
-
llvm.amdgcn.sdot4.ll
-
llvm.amdgcn.sdot8.ll
-
llvm.amdgcn.sudot4.ll
-
llvm.amdgcn.sudot8.ll
-
llvm.amdgcn.trig.preop.ll
-
llvm.amdgcn.udot2.ll
-
llvm.amdgcn.udot4.ll
-
llvm.amdgcn.udot8.ll
-
llvm.powi.ll
-
load-local.128.ll
-
load-local.96.ll
-
load-unaligned.ll
-
lshr.ll
-
mul.ll
-
mul.v2i16.ll
-
orn2.ll
-
saddsat.ll
-
sbfx.ll
-
sext_inreg.ll
-
shl-ext-reduce.ll
-
shl.ll
-
shlN_add.ll
-
smed3.ll
-
ssubsat.ll
-
uaddsat.ll
-
ubfx.ll
-
umed3.ll
-
usubsat.ll
-
v_bfe_i32.ll
-
xnor.ll
-
zextload.ll
-
amd.endpgm.ll
-
atomicrmw-expand.ll
-
back-off-barrier-subtarget-feature.ll
-
bf16.ll
-
bfi_int.ll
-
bitreverse.ll
-
bswap.ll
-
bug-sdag-emitcopyfromreg.ll
-
call-argument-types.ll
-
calling-conventions.ll
-
cgp-addressing-modes-flat.ll
-
chain-hi-to-lo.ll
-
clamp-modifier.ll
-
combine-add-zext-xor.ll
-
cse-convergent.ll
-
cvt_f32_ubyte.ll
-
dagcombine-fma-fmad.ll
-
divergence-driven-buildvector.ll
-
expand-scalar-carry-out-select-user.ll
-
extract-subvector-16bit.ll
-
fast-unaligned-load-store.global.ll
-
fast-unaligned-load-store.private.ll
-
fcanonicalize.f16.ll
-
fcanonicalize.ll
-
fcopysign.f16.ll
-
fcopysign.f32.ll
-
fcopysign.f64.ll
-
flat-scratch.ll
-
fma.f16.ll
-
fmad-formation-fmul-distribute-denormal-mode.ll
-
fmax3.ll
-
fmax_legacy.f16.ll
-
fmin3.ll
-
fmin_legacy.f16.ll
-
fneg-combines.f16.ll
-
fneg-fold-legalize-dag-increase-insts.ll
-
fneg-modifier-casting.ll
-
fneg.ll
-
fold-fabs.ll
-
fp-min-max-flat-atomics.ll
-
fpext-free.ll
-
fpow.ll
-
fract-match.ll
-
fshr.ll
-
function-args.ll
-
function-returns.ll
-
gfx-callable-argument-types.ll
-
gfx-callable-preserved-registers.ll
-
gfx-callable-return-types.ll
-
imm16.ll
-
insert-delay-alu-bug.ll
-
integer-mad-patterns.ll
-
known-never-nan.ll
-
lds-frame-extern.ll
-
llvm.amdgcn.fma.legacy.ll
-
llvm.amdgcn.image.dim.ll
-
llvm.amdgcn.image.sample.d16.dim.ll
-
llvm.amdgcn.image.sample.dim.ll
-
llvm.amdgcn.struct.buffer.load.format.ll
-
llvm.amdgcn.struct.ptr.buffer.load.format.ll
-
llvm.amdgcn.sudot4.ll
-
llvm.amdgcn.sudot8.ll
-
llvm.frexp.ll
-
llvm.is.fpclass.f16.ll
-
llvm.is.fpclass.ll
-
llvm.ldexp.ll
-
llvm.log.ll
-
llvm.log10.ll
-
llvm.log2.ll
-
llvm.mulo.ll
-
llvm.powi.ll
-
lo16-hi16-physreg-copy.mir
-
load-local.128.ll
-
load-local.96.ll
-
mad-mix-hi.ll
-
mad-mix-lo.ll
-
mad-mix.ll
-
mad.u16.ll
-
mad_64_32.ll
-
mad_u64_u32.ll
-
memory_clause.ll
-
minmax.ll
-
module-lds-false-sharing.ll
-
mubuf-legalize-operands-non-ptr-intrinsics.ll
-
mubuf-legalize-operands.ll
-
offset-split-flat.ll
-
offset-split-global.ll
-
permute_i8.ll
-
preserve-hi16.ll
-
ptrmask.ll
-
reassoc-mul-add-1-to-mad.ll
-
roundeven.ll
-
saddsat.ll
-
select-constant-xor.ll
-
select-fabs-fneg-extract.f16.ll
-
select-fabs-fneg-extract.v2f16.ll
-
setcc-multiple-use.ll
-
skip-if-dead.ll
-
ssubsat.ll
-
store-weird-sizes.ll
-
strict_fadd.f16.ll
-
strict_fadd.f32.ll
-
strict_fadd.f64.ll
-
strict_fma.f16.ll
-
strict_fma.f32.ll
-
strict_fma.f64.ll
-
strict_fmul.f16.ll
-
strict_fmul.f32.ll
-
strict_fmul.f64.ll
-
strict_fpext.ll
-
strict_fptrunc.ll
-
strict_fsub.f16.ll
-
strict_fsub.f32.ll
-
strict_fsub.f64.ll
-
strict_ldexp.f16.ll
-
strict_ldexp.f32.ll
-
strict_ldexp.f64.ll
-
uaddsat.ll
-
udiv.ll
-
usubsat.ll
-
v_sat_pk_u8_i16.ll
-
vector_shuffle.packed.ll
-
vgpr-descriptor-waterfall-loop-idom-update.ll
-
vgpr-tuple-allocation.ll
-
waitcnt-bvh.mir
-
waitcnt-overflow.mir
-
waitcnt-preexisting-vscnt.mir
-
waitcnt-vscnt.ll
-
wave32.ll
-
wqm.ll

Differential D153537

[AMDGPU] Do not wait for vscnt on function entry and return
ClosedPublic

Authored by foad on Jun 22 2023, 5:31 AM.

Download Raw Diff

Details

Reviewers

nhaehnle
mareko
rampitec
kerbowa
stepthomas
arsenm

Group Reviewers

Restricted Project

Commits

rGf2c164c81505: [AMDGPU] Do not wait for vscnt on function entry and return

Summary

SIInsertWaitcnts inserts waitcnt instructions to resolve data
dependencies. The GFX10+ vscnt (VMEM store count) counter is never used
in this way. It is only used to resolve memory dependencies, and that is
handled by SIMemoryLegalizer. Hence there is no need to conservatively
wait for vscnt to be 0 on function entry and before returns.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

foad created this revision. Jun 22 2023, 5:31 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jun 22 2023, 5:31 AM

Herald added subscribers: bzcheeseman, StephenFan, wenlei and 7 others. · View Herald Transcript

foad requested review of this revision. Jun 22 2023, 5:31 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jun 22 2023, 5:31 AM

Herald added subscribers: llvm-commits, wdng. · View Herald Transcript

foad added inline comments. Jun 22 2023, 5:33 AM

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
1231	@kerbowa unlike the rest of SIInsertWaitcnts, I assume this part does want to wait for vscnt==0 since it is handling memory dependencies?

Harbormaster completed remote builds in B240481: Diff 533569. Jun 22 2023, 6:50 AM

arsenm added inline comments. Jun 22 2023, 7:24 AM

llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
70	These were super annoying

kerbowa added inline comments. Jun 25 2023, 4:42 PM

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
1231	Technically it should not matter on HW with VScnt since they all can back off barriers, so this 'if' should never be true on Navi. There is an exception currently with gfx11 because of the memory model description bug with cumode, but that should be temporary.

Ping!

This does not change the ABI. It just removes a bunch of wait instructions that were never required in the first place.

It is only used to resolve memory dependencies, and that is handled by SIMemoryLegalizer.

The idea was the stores on the caller side don't need to wait to finish because the prolog wait will take care of it after the latency of the jump. Is the memory legalizer not taking advantage of this?

It is only used to resolve memory dependencies, and that is handled by SIMemoryLegalizer.

The idea was the stores on the caller side don't need to wait to finish because the prolog wait will take care of it after the latency of the jump. Is the memory legalizer not taking advantage of this?

I don't see how it can be taking advantage of that, because it only considers each load or store in isolation.

In D153537#4468557, @foad wrote:

It is only used to resolve memory dependencies, and that is handled by SIMemoryLegalizer.

The idea was the stores on the caller side don't need to wait to finish because the prolog wait will take care of it after the latency of the jump. Is the memory legalizer not taking advantage of this?

I don't see how it can be taking advantage of that, because it only considers each load or store in isolation.

Would that just be an implementation deficiency? With something like https://godbolt.org/z/55YEn473W where does the wait happen?

With something like https://godbolt.org/z/55YEn473W where does the wait happen?

No wait is required. The store and the load stay in order. SIMemoryLegalizer does not insert one, irrespective of whether the store and load are in the same function or not.

LGTM. I guess a wait would only be needed in the atomic case anyway?

This revision is now accepted and ready to land. Jul 3 2023, 9:18 AM

Rebase.

This revision was landed with ongoing or failed builds. Jul 4 2023, 4:26 AM

Closed by commit rGf2c164c81505: [AMDGPU] Do not wait for vscnt on function entry and return (authored by foad). · Explain Why

This revision was automatically updated to reflect the committed changes.

foad added a commit: rGf2c164c81505: [AMDGPU] Do not wait for vscnt on function entry and return.

Harbormaster completed remote builds in B243009: Diff 537051. Jul 4 2023, 5:40 AM

Large Diff

This large diff affects 205 files. Files without inline comments have been collapsed. Expand All Files

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIInsertWaitcnts.cpp

18 lines

test/

CodeGen/

AMDGPU/

GlobalISel/

add.v2i16.ll

7 lines

andn2.ll

6 lines

artifact-combiner-asserts.ll

2 lines

ashr.ll

29 lines

br-constant-invalid-sgpr-copy.ll

4 lines

bswap.ll

10 lines

bug-legalization-artifact-combiner-dead-def.ll

2 lines

clamp-fmed3-const-combine.ll

8 lines

clamp-minmax-const-combine.ll

17 lines

combine-fma-add-fma-mul.ll

48 lines

combine-fma-add-mul.ll

56 lines

combine-fma-sub-mul.ll

60 lines

combine-fma-sub-neg-mul.ll

18 lines

dynamic-alloca-uniform.ll

12 lines

extractelement.i128.ll

10 lines

extractelement.i16.ll

28 lines

62 lines

32 lines

36 lines

62 lines

38 lines

20 lines

42 lines

fmed3-min-max-const-combine.ll

18 lines

20 lines

24 lines

190 lines

289 lines

image-waterfall-loop-O0.ll

2 lines

insertelement.ll

4 lines

llvm.amdgcn.div.fmas.ll

8 lines

llvm.amdgcn.ds.fadd.ll

5 lines

llvm.amdgcn.ds.fmin.ll

5 lines

llvm.amdgcn.fdot2.ll

1 line

llvm.amdgcn.fmul.legacy.ll

39 lines

llvm.amdgcn.global.atomic.csub.ll

8 lines

llvm.amdgcn.image.load.2d.ll

4 lines

llvm.amdgcn.image.load.2darraymsaa.a16.ll

4 lines

llvm.amdgcn.image.load.2darraymsaa.ll

4 lines

llvm.amdgcn.image.load.3d.a16.ll

4 lines

llvm.amdgcn.image.load.3d.ll

4 lines

llvm.amdgcn.s.setreg.ll

2 lines

llvm.amdgcn.sdot2.ll

13 lines

llvm.amdgcn.sdot4.ll

5 lines

llvm.amdgcn.sdot8.ll

4 lines

llvm.amdgcn.sudot4.ll

8 lines

llvm.amdgcn.sudot8.ll

8 lines

llvm.amdgcn.trig.preop.ll

2 lines

13 lines

6 lines

4 lines

12 lines

12 lines

12 lines

8 lines

31 lines

13 lines

4 lines

6 lines

27 lines

120 lines

28 lines

13 lines

33 lines

5 lines

10 lines

27 lines

25 lines

86 lines

10 lines

25 lines

1 line

4 lines

18 lines

2 lines

2 lines

back-off-barrier-subtarget-feature.ll

8 lines

55 lines

16 lines

2 lines

8 lines

bug-sdag-emitcopyfromreg.ll

2 lines

call-argument-types.ll

6 lines

calling-conventions.ll

38 lines

cgp-addressing-modes-flat.ll

12 lines

chain-hi-to-lo.ll

34 lines

clamp-modifier.ll

1 line

combine-add-zext-xor.ll

12 lines

cse-convergent.ll

2 lines

cvt_f32_ubyte.ll

48 lines

dagcombine-fma-fmad.ll

3 lines

divergence-driven-buildvector.ll

8 lines

expand-scalar-carry-out-select-user.ll

2 lines

extract-subvector-16bit.ll

7 lines

fast-unaligned-load-store.global.ll

6 lines

fast-unaligned-load-store.private.ll

36 lines

20 lines

7 lines

6 lines

12 lines

11 lines

74 lines

8 lines

fmad-formation-fmul-distribute-denormal-mode.ll

18 lines

1 line

10 lines

1 line

10 lines

116 lines

fneg-fold-legalize-dag-increase-insts.ll

1 line

fneg-modifier-casting.ll

47 lines

fneg.ll

5 lines

fold-fabs.ll

7 lines

fp-min-max-flat-atomics.ll

4 lines

21 lines

20 lines

69 lines

24 lines

124 lines

55 lines

gfx-callable-argument-types.ll

270 lines

gfx-callable-preserved-registers.ll

32 lines

gfx-callable-return-types.ll

29 lines

imm16.ll

36 lines

insert-delay-alu-bug.ll

2 lines

integer-mad-patterns.ll

60 lines

known-never-nan.ll

2 lines

lds-frame-extern.ll

3 lines

llvm.amdgcn.fma.legacy.ll

12 lines

llvm.amdgcn.image.dim.ll

30 lines

llvm.amdgcn.image.sample.d16.dim.ll

2 lines

llvm.amdgcn.image.sample.dim.ll

6 lines

llvm.amdgcn.struct.buffer.load.format.ll

8 lines

llvm.amdgcn.struct.ptr.buffer.load.format.ll

8 lines

llvm.amdgcn.sudot4.ll

8 lines

llvm.amdgcn.sudot8.ll

8 lines

llvm.frexp.ll

21 lines

llvm.is.fpclass.f16.ll

105 lines

40 lines

78 lines

65 lines

65 lines

61 lines

8 lines

12 lines

lo16-hi16-physreg-copy.mir

5 lines

12 lines

12 lines

10 lines

22 lines

38 lines

6 lines

17 lines

2 lines

6 lines

24 lines

module-lds-false-sharing.ll

2 lines

mubuf-legalize-operands-non-ptr-intrinsics.ll

12 lines

mubuf-legalize-operands.ll

12 lines

offset-split-flat.ll

95 lines

offset-split-global.ll

90 lines

permute_i8.ll

92 lines

preserve-hi16.ll

50 lines

ptrmask.ll

9 lines

reassoc-mul-add-1-to-mad.ll

83 lines

roundeven.ll

32 lines

saddsat.ll

9 lines

select-constant-xor.ll

11 lines

select-fabs-fneg-extract.f16.ll

45 lines

select-fabs-fneg-extract.v2f16.ll

45 lines

setcc-multiple-use.ll

1 line

37 lines

14 lines

6 lines

9 lines

14 lines

7 lines

16 lines

8 lines

8 lines

14 lines

14 lines

7 lines

161 lines

155 lines

23 lines

14 lines

7 lines

8 lines

4 lines

151 lines

12 lines

1 line

16 lines

8 lines

vector_shuffle.packed.ll

166 lines

vgpr-descriptor-waterfall-loop-idom-update.ll

4 lines

vgpr-tuple-allocation.ll

4 lines

waitcnt-bvh.mir

5 lines

waitcnt-overflow.mir

6 lines

waitcnt-preexisting-vscnt.mir

10 lines

waitcnt-vscnt.ll

4 lines

wave32.ll

6 lines

wqm.ll

21 lines

Diff 537052

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Show First 20 Lines • Show All 413 Lines • ▼ Show 20 Lines	public:

bool isForceEmitWaitcnt() const {		bool isForceEmitWaitcnt() const {
for (auto T : inst_counter_types())		for (auto T : inst_counter_types())
if (ForceEmitWaitcnt[T])		if (ForceEmitWaitcnt[T])
return true;		return true;
return false;		return false;
}		}

AMDGPU::Waitcnt allZeroWaitcnt() const {
return AMDGPU::Waitcnt::allZero(ST->hasVscnt());
}

void setForceEmitWaitcnt() {		void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;		// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// For debug builds, get the debug counter info and adjust if need be		// For debug builds, get the debug counter info and adjust if need be
#ifndef NDEBUG		#ifndef NDEBUG
if (DebugCounter::isCounterSet(ForceExpCounter) &&		if (DebugCounter::isCounterSet(ForceExpCounter) &&
DebugCounter::shouldExecute(ForceExpCounter)) {		DebugCounter::shouldExecute(ForceExpCounter)) {
ForceEmitWaitcnt[EXP_CNT] = true;		ForceEmitWaitcnt[EXP_CNT] = true;
} else {		} else {
▲ Show 20 Lines • Show All 597 Lines • ▼ Show 20 Lines	bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,

// All waits must be resolved at call return.		// All waits must be resolved at call return.
// NOTE: this could be improved with knowledge of all call sites or		// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.		// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG \|\|		if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG \|\|
MI.getOpcode() == AMDGPU::SI_RETURN \|\|		MI.getOpcode() == AMDGPU::SI_RETURN \|\|
MI.getOpcode() == AMDGPU::S_SETPC_B64_return \|\|		MI.getOpcode() == AMDGPU::S_SETPC_B64_return \|\|
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {		(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(allZeroWaitcnt());		Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt());
}		}
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM		// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
// stores. In this case it can be useful to send a message to explicitly		// stores. In this case it can be useful to send a message to explicitly
// release all VGPRs before the stores have completed, but it is only safe to		// release all VGPRs before the stores have completed, but it is only safe to
// do this if there are no outstanding scratch stores.		// do this if there are no outstanding scratch stores.
else if (MI.getOpcode() == AMDGPU::S_ENDPGM \|\|		else if (MI.getOpcode() == AMDGPU::S_ENDPGM \|\|
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {		MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&		if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
▲ Show 20 Lines • Show All 179 Lines • ▼ Show 20 Lines	#endif
}		}

// The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does		// The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
// not, we need to ensure the subtarget is capable of backing off barrier		// not, we need to ensure the subtarget is capable of backing off barrier
// instructions in case there are any outstanding memory operations that may		// instructions in case there are any outstanding memory operations that may
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.		// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&		if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {		!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
Wait = Wait.combined(allZeroWaitcnt());		Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
		Inline comment — foad (author; marked Done): @kerbowa unlike the rest of SIInsertWaitcnts, I assume this part does want to wait for vscnt==0 since it is handling memory dependencies?
		Inline comment — kerbowa (marked Not Done): Technically it should not matter on HW with VScnt since they all can back off barriers, so this 'if' should never be true on Navi. There is an exception currently with gfx11 because of the memory model description bug with cumode, but that should be temporary.
}		}

// TODO: Remove this work-around, enable the assert for Bug 457939		// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is		// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.		// independent of target.
if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {		if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {		if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
Wait.LgkmCnt = 0;		Wait.LgkmCnt = 0;
}		}
}		}

// Verify that the wait is actually needed.		// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);		ScoreBrackets.simplifyWaitcnt(Wait);

if (ForceEmitZeroWaitcnts)		if (ForceEmitZeroWaitcnts)
Wait = allZeroWaitcnt();		Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt();

if (ForceEmitWaitcnt[VM_CNT])		if (ForceEmitWaitcnt[VM_CNT])
Wait.VmCnt = 0;		Wait.VmCnt = 0;
if (ForceEmitWaitcnt[EXP_CNT])		if (ForceEmitWaitcnt[EXP_CNT])
Wait.ExpCnt = 0;		Wait.ExpCnt = 0;
if (ForceEmitWaitcnt[LGKM_CNT])		if (ForceEmitWaitcnt[LGKM_CNT])
Wait.LgkmCnt = 0;		Wait.LgkmCnt = 0;
if (ForceEmitWaitcnt[VS_CNT])
Wait.VsCnt = 0;

if (FlushVmCnt) {		if (FlushVmCnt) {
if (ScoreBrackets.hasPendingEvent(VM_CNT))		if (ScoreBrackets.hasPendingEvent(VM_CNT))
Wait.VmCnt = 0;		Wait.VmCnt = 0;
}		}

return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,		return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
OldWaitcntInstr);		OldWaitcntInstr);
▲ Show 20 Lines • Show All 206 Lines • ▼ Show 20 Lines	if (ST->vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() \|\| SIInstrInfo::isAtomicRet(Inst))) {		(Inst.mayStore() \|\| SIInstrInfo::isAtomicRet(Inst))) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);		ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}		}
} else if (TII->isSMRD(Inst)) {		} else if (TII->isSMRD(Inst)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);		ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
} else if (Inst.isCall()) {		} else if (Inst.isCall()) {
if (callWaitsOnFunctionReturn(Inst)) {		if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything		// Act as a wait on everything
ScoreBrackets->applyWaitcnt(allZeroWaitcnt());		ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
} else {		} else {
// May need to way wait for anything.		// May need to way wait for anything.
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());		ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
}		}
} else if (SIInstrInfo::isLDSDIR(Inst)) {		} else if (SIInstrInfo::isLDSDIR(Inst)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);		ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
} else if (TII->isVINTERP(Inst)) {		} else if (TII->isVINTERP(Inst)) {
int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();		int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
▲ Show 20 Lines • Show All 365 Lines • ▼ Show 20 Lines	if (!MFI->isEntryFunction()) {
// TODO: Could insert earlier and schedule more liberally with operations		// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.		// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();		MachineBasicBlock &EntryBB = MF.front();
MachineBasicBlock::iterator I = EntryBB.begin();		MachineBasicBlock::iterator I = EntryBB.begin();
for (MachineBasicBlock::iterator E = EntryBB.end();		for (MachineBasicBlock::iterator E = EntryBB.end();
I != E && (I->isPHI() \|\| I->isMetaInstruction()); ++I)		I != E && (I->isPHI() \|\| I->isMetaInstruction()); ++I)
;		;
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);		BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
if (ST->hasVscnt())
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(0);

Modified = true;		Modified = true;
}		}

// Keep iterating over the blocks in reverse post order, inserting and		// Keep iterating over the blocks in reverse post order, inserting and
// updating s_waitcnt where needed, until a fix point is reached.		// updating s_waitcnt where needed, until a fix point is reached.
for (auto MBB : ReversePostOrderTraversal<MachineFunction >(&MF))		for (auto MBB : ReversePostOrderTraversal<MachineFunction >(&MF))
BlockInfos.insert({MBB, BlockInfo()});		BlockInfos.insert({MBB, BlockInfo()});
▲ Show 20 Lines • Show All 111 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Do not wait for vscnt on function entry and return
ClosedPublic

Details

Diff Detail

Event Timeline

Large Diff

Revision Contents

Diff 537052

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/sbfx.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/ubfx.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll

[AMDGPU] Do not wait for vscnt on function entry and return
ClosedPublic