This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU/SI: Better handle s_wait insertion
ClosedPublic

Authored by axeldavy on Aug 9 2015, 5:54 AM.

Download Raw Diff

Details

Reviewers

• tstellarAMD
arsenm

Commits

rGbd8a0856e2f9: AMDGPU/SI: Better handle s_wait insertion
rL245755: AMDGPU/SI: Better handle s_wait insertion

Summary

We can wait on either VM, EXP or LGKM.
The waits are independent.

Without this patch, a wait inserted because of one of these counters
would also wait for all previously issued operations tracked by the other two.
This patch makes s_waitcnt wait only on the counters that the next instruction actually needs.

Here's an example of a subtle performance loss that this patch fixes:

This is without the patch:

buffer_load_format_xyzw v[8:11], v0, s[44:47], 0 idxen
buffer_load_format_xyzw v[12:15], v0, s[48:51], 0 idxen
s_load_dwordx4 s[44:47], s[8:9], 0xc
s_waitcnt lgkmcnt(0)
buffer_load_format_xyzw v[16:19], v0, s[52:55], 0 idxen
s_load_dwordx4 s[48:51], s[8:9], 0x10
s_waitcnt vmcnt(1)
buffer_load_format_xyzw v[20:23], v0, s[44:47], 0 idxen

The s_waitcnt vmcnt(1) is useless.
It is added because the last
buffer_load_format_xyzw needs s[44:47], which is written
by the first s_load_dwordx4. The inserted wait also waits for all VM
operations issued before that s_load_dwordx4 to have finished.

Internally, 3 counters (for VM, EXP and LGKM)
are updated after every instruction. For example, buffer_load_format_xyzw will
increase the VM counter, and s_load_dwordx4 the LGKM one.

Without the patch, for every defined register,
the current 3 counters are stored, and are used to know
how long to wait when an instruction needs the register.

Because of that, the counters stored for s[44:47] also record that, to use the register,
you need to wait for the earlier buffer_load_format_xyzw instructions.

Instead this patch stores only the counters that matter for the register,
and puts zero for the other ones, since we don't need any wait for them.

Diff Detail

Event Timeline

axeldavy updated this revision to Diff 31615.Aug 9 2015, 5:54 AM

axeldavy retitled this revision from to AMDGPU/SI: Better handle s_wait insertion.

axeldavy updated this object.

axeldavy added a subscriber: llvm-commits.

axeldavy added reviewers: arsenm, • tstellarAMD.Aug 9 2015, 6:06 AM

• tstellarAMD added inline comments.Aug 13 2015, 1:32 PM

test/CodeGen/AMDGPU/wait.ll
7–8	I would add a check for the instruction between these two s_waitcnt instructions. We want to make sure some future change doesn't regress us, and cause us to emit two s_waitcnt instructions in a row.
test/CodeGen/AMDGPU/wait2.ll
1 ↗	(On Diff #31615)	Is there a reason why this test was added to a new file and not to wait.ll?

axeldavy added inline comments.Aug 13 2015, 10:15 PM

test/CodeGen/AMDGPU/wait.ll
7–8	OK sure
test/CodeGen/AMDGPU/wait2.ll
1 ↗	(On Diff #31615)	yes, the run command is different (and uses the ilpmax scheduler)

• tstellarAMD added inline comments.Aug 14 2015, 6:37 AM

test/CodeGen/AMDGPU/wait2.ll
1 ↗	(On Diff #31615)	You can have multiple run lines in the same test file, you just need to use a different --check-prefix if the output is different. It is preferred to put similar test cases in the same file if possible.

Take comments into account

Closed by commit rL245755: AMDGPU/SI: Better handle s_wait insertion (authored by tstellar). · Explain WhyAug 21 2015, 3:48 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIInsertWaits.cpp

7 lines

test/

CodeGen/

AMDGPU/

setcc-opt.ll

4 lines

wait.ll

57 lines

Diff 32245

lib/Target/AMDGPU/SIInsertWaits.cpp

Context not available.

	// Get the hardware counter increments and sum them up	// Get the hardware counter increments and sum them up
	Counters Increment = getHwCounts(*I);	Counters Increment = getHwCounts(*I);
		Counters Limit = ZeroCounts;
	unsigned Sum = 0;	unsigned Sum = 0;

	for (unsigned i = 0; i < 3; ++i) {	for (unsigned i = 0; i < 3; ++i) {
	LastIssued.Array[i] += Increment.Array[i];	LastIssued.Array[i] += Increment.Array[i];
		if (Increment.Array[i])
		Limit.Array[i] = LastIssued.Array[i];
	Sum += Increment.Array[i];	Sum += Increment.Array[i];
	}	}

Context not available.

	// Remember which registers we define	// Remember which registers we define
	if (Op.isDef())	if (Op.isDef())
	DefinedRegs[j] = LastIssued;	DefinedRegs[j] = Limit;

	// and which one we are using	// and which one we are using
	if (Op.isUse())	if (Op.isUse())
	UsedRegs[j] = LastIssued;	UsedRegs[j] = Limit;
	}	}
	}	}
	}	}
Context not available.

test/CodeGen/AMDGPU/setcc-opt.ll

Context not available.
	; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}}	; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}}
	; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]]	; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]]
	; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc	; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
	; GCN-NEXT: buffer_store_byte [[RESULT]]	; GCN: buffer_store_byte [[RESULT]]
	; GCN: s_endpgm	; GCN: s_endpgm
	define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {	define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
	%b.ext = zext i8 %b to i32	%b.ext = zext i8 %b to i32
Context not available.
	; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}}	; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}}
	; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}}	; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}}
	; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc	; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
	; GCN-NEXT: buffer_store_byte [[RESULT]]	; GCN: buffer_store_byte [[RESULT]]
	; GCN: s_endpgm	; GCN: s_endpgm
	define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {	define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
	%b.ext = sext i8 %b to i32	%b.ext = sext i8 %b to i32
Context not available.

test/CodeGen/AMDGPU/wait.ll

	; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s	; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=DEFAULT
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=DEFAULT
		; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=SI -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=ILPMAX
	; CHECK-LABEL: {{^}}main:	; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=ILPMAX
	; CHECK: s_load_dwordx4	; The ilpmax scheduler is used for the second test to get the ordering we want for the test.
	; CHECK: s_load_dwordx4
	; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}	; DEFAULT-LABEL: {{^}}main:
	; CHECK: s_endpgm	; DEFAULT: s_load_dwordx4
		tstellarAMDUnsubmitted Not Done Reply Inline Actions I would add a check for the instruction between these two s_waitcnt instructions. We want to make sure some future change doesn't regress us, and cause us to emit two s_waitcnt instructions in a row. tstellarAMD: I would add a check for the instruction between these two s_waitcnt instructions. We want to…
		axeldavyAuthorUnsubmitted Not Done Reply Inline Actions OK sure axeldavy: OK sure
		; DEFAULT: s_load_dwordx4
		; DEFAULT: s_waitcnt vmcnt(0)
		; DEFAULT: exp
		; DEFAULT: s_waitcnt lgkmcnt(0)
		; DEFAULT: s_endpgm
	define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {	define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
	main_body:	main_body:
	%tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0	%tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
Context not available.
	ret void	ret void
	}	}

		; ILPMAX-LABEL: {{^}}main2:
		; ILPMAX: s_load_dwordx4
		; ILPMAX: s_waitcnt lgkmcnt(0)
		; ILPMAX: buffer_load
		; ILPMAX: s_load_dwordx4
		; ILPMAX: s_waitcnt lgkmcnt(0)
		; ILPMAX: buffer_load
		; ILPMAX: s_waitcnt vmcnt(1)
		; ILPMAX: s_waitcnt vmcnt(0)
		; ILPMAX: s_endpgm

		define void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
		byval, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
		main_body:
		%11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
		%12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0
		%13 = add i32 %5, %7
		%14 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %12, i32 0, i32 %13)
		%15 = extractelement <4 x float> %14, i32 0
		%16 = extractelement <4 x float> %14, i32 1
		%17 = extractelement <4 x float> %14, i32 2
		%18 = extractelement <4 x float> %14, i32 3
		%19 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1
		%20 = load <16 x i8>, <16 x i8> addrspace(2)* %19, align 16, !tbaa !0
		%21 = add i32 %5, %7
		%22 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %20, i32 0, i32 %21)
		%23 = extractelement <4 x float> %22, i32 0
		%24 = extractelement <4 x float> %22, i32 1
		%25 = extractelement <4 x float> %22, i32 2
		%26 = extractelement <4 x float> %22, i32 3
		call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %15, float %16, float %17, float %18)
		call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %23, float %24, float %25, float %26)
		ret void
		}


	; Function Attrs: noduplicate nounwind	; Function Attrs: noduplicate nounwind
	declare void @llvm.AMDGPU.barrier.global() #1	declare void @llvm.AMDGPU.barrier.global() #1

Context not available.